#MachineLearning #ReinforcementLearning
By Billy Gustave
Business challenge/requirement
BluEx is a leading logistics company in India, known for efficient delivery of packages to customers. However, BluEx is facing a challenge: its van drivers are taking suboptimal paths for delivery, which causes delays and higher fuel costs.
Goal:
The driver has to cover 8 stops (0 to 7), where stop 0 is the start and stop 7 is the destination.
The following list of stop connections was provided:
# List of connected stop pairs (the route network):
points_list = [(0,1), (1,5), (5,6), (5,4), (1,2), (2,3), (2,7)]
mapping={0:'Start', 1:'1', 2:'2', 3:'3', 4:'4', 5:'5', 6:'6', 7:'7-Destination'}
import numpy as np
Visual representation of how the points/stops are connected
import networkx as nx
import matplotlib.pyplot as plt
goal = 7
G=nx.Graph()
G.add_edges_from(points_list)
pos = nx.spring_layout(G,k=.5,center=points_list[2])
nx.draw_networkx_nodes(G,pos,node_color='g')
nx.draw_networkx_edges(G,pos,edge_color='b')
nx.draw_networkx_labels(G, pos, labels=mapping)
plt.show()
There are 8 states (stops 0 to 7).
Goal: stop 7, reached through the connection (2,7).
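Before hard-coding the reward matrix, here is a small sketch (an addition, not part of the original notebook) showing how it can be derived from points_list: -1 where there is no direct connection, 0 where a connection exists, and 100 for any move into the goal stop. It should reproduce the hand-written R below.
# Sketch (assumption): build the reward matrix from points_list and goal
R_check = np.matrix(np.ones((8, 8)) * -1)     # default: no direct connection
for a, b in points_list:
    R_check[a, b] = 100 if b == goal else 0   # connections are two-way
    R_check[b, a] = 100 if a == goal else 0
R_check[goal, goal] = 100                     # staying at the goal keeps the reward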
# Reward matrix R: rows = current state, columns = action (next stop)
# -1 = no direct connection, 0 = connection exists, 100 = reaching the goal
R = np.matrix([[-1, 0, -1, -1, -1, -1, -1, -1],
[0, -1, 0, -1, -1, 0, -1, -1],
[-1, 0, -1, 0, -1, -1, -1, 100],
[-1, -1, 0, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, 0, -1, -1],
[-1, 0, -1, -1, 0, -1, 0, -1],
[-1, -1, -1, -1, -1, 0, -1, -1],
[-1, -1, 0, -1, -1, -1, -1, 100]])
R.shape
# Q Matrix
Q = np.matrix(np.zeros([8,8]))
# discount factor (gamma): weight given to future rewards
gamma = 0.85
# Initial state (Usually to be chosen at random)
initial_state = 1
# returns available actions/destinations from a given state
def available_actions(state):
    current_state_row = R[state,]
    av_act = np.where(current_state_row >= 0)[1]
    return av_act
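As a quick illustration (this call is an addition), checking the initial state shows which stops can be reached directly from stop 1:
available_actions(initial_state)   # expected: array([0, 2, 5]) -- stops 0, 2 and 5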
# randomly chooses one of the available actions/destinations
def sample_next_action(available_actions_range):
    next_action = int(np.random.choice(available_actions_range, 1))
    return next_action
# updates the Q matrix with the Q-learning rule for the chosen state/action pair
def update(current_state, action, gamma):
    # value of the best action available from the stop we land in
    max_index = np.where(Q[action,] == np.max(Q[action,]))[1]
    if max_index.shape[0] > 1:
        max_index = int(np.random.choice(max_index, size=1))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]
    Q[current_state, action] = R[current_state, action] + gamma*max_value
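For reference, the update above is a simplified Q-learning rule with an implicit learning rate of 1: the immediate reward for the move plus the discounted value of the best move available from the stop we land in (the next state is simply the chosen action, since taking an action means driving to that stop):
$$Q(s, a) \leftarrow R(s, a) + \gamma \max_{a'} Q(a, a')$$
With gamma = 0.85, future rewards still count strongly, which lets the 100-point goal reward propagate back along the route.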
# train with 20000 iterations, each starting from a random state
for i in range(20000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)
print("Results:")
print(Q)
print("Results nomalized:")
print(Q/np.max(Q)*100) # normalized
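An optional convergence check (an addition, not in the original run): re-run the training while recording a score after each iteration; once the curve flattens, extra iterations change the Q matrix very little.
# Sketch (assumption): track how the normalized Q values grow during training
Q = np.matrix(np.zeros([8, 8]))   # reset Q before re-training
scores = []
for i in range(20000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)
    scores.append(np.sum(Q/np.max(Q)*100) if np.max(Q) > 0 else 0)
plt.plot(scores)
plt.xlabel('Iteration')
plt.ylabel('Normalized sum of Q')
plt.show()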
# follows the highest-Q action from a given start state until the goal (stop 7) is reached
def optimal_path(current_state):
    steps = [current_state]
    while current_state != 7:
        next_step_index = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]
        if next_step_index.shape[0] > 1:
            next_step_index = int(np.random.choice(next_step_index, size=1))
        else:
            next_step_index = int(next_step_index)
        steps.append(next_step_index)
        current_state = next_step_index
    print("Optimal path: \n", steps)
optimal_path(0)
optimal_path(5)
optimal_path(3)
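Since the route network is already loaded into networkx above, the learned routes can be sanity-checked against a plain shortest-path search on the unweighted graph (this cross-check is an addition for illustration):
# Cross-check (assumption): networkx shortest paths on the same graph
print(nx.shortest_path(G, source=0, target=7))
print(nx.shortest_path(G, source=5, target=7))
print(nx.shortest_path(G, source=3, target=7))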