mdp_policy.py
import time

from mdp import MDP


class MDP_policy(MDP):
    def __init__(self, maze=None):
        super().__init__(maze)
        self.target = [self.goal]
        # Utility estimate for every state; the goal starts with a large value.
        self.values = {state: 0 for state in self.actions.keys()}
        self.values[self.target[0]] = pow(10, 7)
        # Initial policy: every state starts with the action 'N'.
        self.policyValues = {s: 'N' for s in self.actions.keys()}
        self._reward = {state: -40 for state in self.actions.keys()}  # living reward
        self._reward[self.target[0]] = pow(10, 8)  # reward for reaching the target
        self.algoPath = {}
        self.mainTime = 0
        self.explored = set()

    def solve(self):
        start = time.time()
        policyTrigger = True
        while policyTrigger:  # repeat until the policy stops changing
            policyTrigger = False
            valueTrigger = True
            while valueTrigger:  # one value-update sweep per improvement pass
                valueTrigger = False
                for state in self.actions.keys():
                    self.explored.add(state)
                    if state == self.target[0]:
                        continue
                    utilityMax = float('-inf')
                    actionMax = None
                    # Evaluate every legal move out of this state.
                    for action, prob in self.actions[state].items():
                        for direction in action:
                            if self.maze.maze_map[state][direction]:
                                childNode = self.move(state, direction)
                                utility = super().calculate_PolicyIterationUtility(
                                    prob, self._reward, state, childNode, self.values)
                                if utility > utilityMax:
                                    utilityMax = utility
                                    actionMax = action
                    self.values[state] = utilityMax
                    # Improve the policy; keep iterating if any state's action changed.
                    if self.policyValues[state] != actionMax:
                        policyTrigger = True
                        self.policyValues[state] = actionMax
        # Follow the final policy from the start cell to the target to build the path.
        node = (self.maze.rows, self.maze.cols)
        while node != self.target[0]:
            nextNode = self.move(node, self.policyValues[node])
            self.algoPath[node] = nextNode
            node = nextNode
        end = time.time()
        return self.algoPath, self.explored, end - start
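

# A minimal usage sketch, assuming the surrounding project follows pyamaze
# conventions (the solver reads maze.maze_map keyed by (row, col) with
# 'N'/'S'/'E'/'W' open-wall flags, the agent starts at (rows, cols), and the
# default goal is (1, 1)); the base MDP class is assumed to provide self.goal,
# self.actions and move(). The maze construction and path tracing below are
# illustrative assumptions, not part of this module's tested behaviour.
if __name__ == '__main__':
    from pyamaze import maze, agent

    m = maze(10, 10)                 # assumed 10x10 maze with default goal (1, 1)
    m.CreateMaze(loopPercent=40)     # loops give the policy real choices to make
    solver = MDP_policy(m)
    path, explored, elapsed = solver.solve()
    print(f'Explored {len(explored)} states in {elapsed:.4f} s')

    a = agent(m, footprints=True)
    m.tracePath({a: path})           # path maps each cell to its successor
    m.run()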