# Q-value iteration over a 3-state x 3-action table.
# `possible_actions`, `T` and `R` are defined elsewhere in the file.
# NOTE(review): presumably possible_actions[s] lists the action indices
# allowed in state s, and T[s, a, sp] / R[s, a, sp] are the transition
# probability and reward for moving from s to sp via action a -- confirm
# against their definitions.
Q = np.full((3, 3), -np.inf)  # -inf for impossible actions
for state, actions in enumerate(possible_actions):
    Q[state, actions] = 0.0  # initial value = 0.0 for all possible actions

discount_rate = 0.95
n_iterations = 100

for iteration in range(n_iterations):
    # Snapshot the table so every update in this sweep reads the estimates
    # from the previous sweep rather than partially updated ones.
    Q_prev = Q.copy()
    # Best previous Q-value of each next state. It depends only on Q_prev,
    # so it is computed once per sweep instead of once per (s, a, sp).
    best_next = Q_prev.max(axis=1)
    for s in range(3):
        for a in possible_actions[s]:
            # Sum over next states sp of
            # T[s, a, sp] * (R[s, a, sp] + discount_rate * best_next[sp]).
            Q[s, a] = np.sum([
                T[s, a, sp] * (R[s, a, sp] + discount_rate * best_next[sp])
                for sp in range(3)
            ])
>>> Q
array([[ 21.89498982, 20.80024033, 16.86353093],
[ 1.11669335, ...