There are four locations (labelled with different letters), and our job is to pick up the passenger at one location and drop them off at another. We receive +20 points for a successful drop-off and lose 1 point for every time-step it takes. There is also a 10-point penalty for illegal pick-up and drop-off actions.
First, import all the required libraries:
import numpy as np
import gym
import random
import pandas as pd
import spacy
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher
env = gym.make("Taxi-v3")
env.render()
Output:
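Before parsing the SMS data, the reward scheme described at the top can be checked directly by taking one random action in the environment created above (a minimal sketch, assuming the classic Gym API used in this post, where env.step returns a (state, reward, done, info) tuple):
env.reset()
state, reward, done, info = env.step(env.action_space.sample())
# reward is -1 for an ordinary move, -10 for an illegal pick-up/drop-off,
# and +20 when the passenger is dropped off at the requested location
print(reward)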
Fetching the origin, destination, and time of pickup from the SMS data:
def fetch_pickup_drop(text_file_path):
    # Append all the texts to a list
    texts_list = []
    # read the SMS text file
    df = pd.read_csv(text_file_path, header=None, names=['Sms'])
    for i in range(0, df.shape[0]):
        s = df.iloc[i, 0]
        texts_list.append(s)
    # add the location matcher to the pipeline through add_pipe (spaCy 2.x API)
    l1 = []
    LOCATIONS = ["dwarka sector 23", "dwarka sector 21", "hauz khaas", "airport"]
    nlp = spacy.load('en')
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("LOCATIONS", None, *list(nlp.pipe(LOCATIONS)))
    def places_component(doc):
        # tag every matched location phrase as a GPE entity
        doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matcher(doc)]
        return doc
    nlp.add_pipe(places_component)  # last=True
    # fetch the locations from each text and append them to the l1 list
    for doc in nlp.pipe(texts_list):
        l1.append([(ent.text, ent.label_) for ent in doc.ents])
    dest = []
    pickup = []
    timing = []
    for i in range(0, len(texts_list)):
        str_text = texts_list[i].lower()
        str1 = 'for ' + l1[i][1][0]
        str2 = 'to ' + l1[i][1][0]
        str3 = 'from ' + l1[i][1][0]
        # fetch the pick-up and drop locations from each SMS and append them to the pickup and destination lists
        if str1 in str_text or str2 in str_text:
            dest.append(l1[i][1][0])
            pickup.append(l1[i][0][0])
        elif str3 in str_text:
            dest.append(l1[i][0][0])
            pickup.append(l1[i][1][0])
        # fetch the pick-up time from each SMS and append it to the timing list
        if 'am' in str_text:
            new_str = str_text[0:str_text.index('am') - 1]
            n = new_str.rindex(' ')
            timing.append(new_str[n + 1:] + ' AM')
        elif 'pm' in str_text:
            new_str = str_text[0:str_text.index('pm') - 1]
            n = new_str.rindex(' ')
            timing.append(new_str[n + 1:] + ' PM')
    # create separate dataframes for pick-up, destination, and time of pickup
    df1 = pd.DataFrame(pickup, columns=['origing'])
    df2 = pd.DataFrame(dest, columns=['destination'])
    df3 = pd.DataFrame(timing, columns=['time of pickup'])
    # concatenate the three dataframes into the final dataframe for the SMS text file
    df_table_final = pd.concat([df1, df2, df3], axis=1)
    return df_table_final
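The least obvious part of this function is how pick-up and drop locations are told apart: the matcher returns the locations in the order they appear in the SMS, and the preposition in front of the second one decides which is which. A small sketch of that heuristic on a made-up SMS (the sentence and entity list below are hypothetical):
# hypothetical SMS (already lower-cased) and its extracted entities, in order of appearance
str_text = "please book a cab from hauz khaas to airport at 5 pm"
l1_entry = [("hauz khaas", "GPE"), ("airport", "GPE")]

second = l1_entry[1][0]
if ("to " + second) in str_text or ("for " + second) in str_text:
    pickup, dest = l1_entry[0][0], second   # "... to airport": airport is the destination
elif ("from " + second) in str_text:
    pickup, dest = second, l1_entry[0][0]   # "... from airport": airport is the pick-up point

print(pickup, "->", dest)  # hauz khaas -> airport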
env.reset() # reset environment to a new, random state
env.render()
action_size = env.action_space.n
print("Action size ", action_size)
state_size = env.observation_space.n
print("State size ", state_size)
q_table = np.zeros((state_size, action_size))
print(q_table)
Output:
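For Taxi-v3 this table has 500 rows and 6 columns: 500 discrete states (25 taxi positions x 5 passenger locations x 4 destinations) and 6 actions (south, north, east, west, pick-up, drop-off). A quick sanity check on the objects created above:
print(q_table.shape)                                # (500, 6): one row per state, one column per action
print(env.observation_space.n, env.action_space.n)  # 500 6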
Training the Agent:
%%time
"""Training the agent"""
import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1    # learning rate
gamma = 0.6    # discount factor
epsilon = 0.1  # exploration rate

# For plotting metrics
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0
    done = False
    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore action space
        else:
            action = np.argmax(q_table[state])  # Exploit learned values
        next_state, reward, done, info = env.step(action)
        # Q-learning update
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value
        if reward == -10:
            penalties += 1
        state = next_state
        epochs += 1
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")
Output:
Episode: 100000
Training finished.

CPU times: user 1min 10s, sys: 13.3 s, total: 1min 24s
Wall time: 1min 12s
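The key line in the loop above is the Q-learning update: each visit to a state-action pair blends the old estimate with the observed reward plus the discounted value of the best action in the next state. A standalone restatement of that update, using the same alpha and gamma as above (the helper name q_update is just for illustration):
def q_update(old_value, reward, next_max, alpha=0.1, gamma=0.6):
    # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a'))
    return (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

# e.g. an illegal pick-up (reward -10) from a state whose best next value is still 0:
print(q_update(0.0, -10, 0.0))  # -1.0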
Fetching the pick-up and drop locations from the SMS file and mapping them to numeric city codes:
text_file_path='drive/My Drive/project_2_dataset/sms.txt'
df_original=fetch_pickup_drop(text_file_path)
print(df_original)
# Create a local dictionary of cities
city = pd.read_csv('drive/My Drive/project_2_dataset/city.csv')
city['mapping'] = city['mapping'].map({0: 0., 1: 1., 2: 2., 3: 3.})
loc_dict = {city.iloc[0, 0]: city.iloc[0, 1],
            city.iloc[1, 0]: city.iloc[1, 1],
            city.iloc[2, 0]: city.iloc[2, 1],
            city.iloc[3, 0]: city.iloc[3, 1]}
# Replace the location names with their numeric city codes in df_original
df_original['origing'] = df_original['origing'].map(loc_dict)
df_original['destination'] = df_original['destination'].map(loc_dict)
Output:
              origing       destination time of pickup
0             airport        hauz khaas           3 PM
1             airport        hauz khaas           6 PM
2          hauz khaas  dwarka sector 23           1 PM
3             airport        hauz khaas           1 AM
4             airport  dwarka sector 21          10 PM
..                ...               ...            ...
995           airport  dwarka sector 23           2 AM
996  dwarka sector 21  dwarka sector 23           2 PM
997        hauz khaas  dwarka sector 21           5 AM
998           airport  dwarka sector 23           6 PM
999           airport        hauz khaas           1 AM

[1000 rows x 3 columns]
Checking pick-up and drop-off correctness:
#### Check pick-up and drop-off correctness
def check_pick_up_drop_correction(pick_up, drop, line_num):
    original_origin = int(df_original.iloc[line_num, 0])
    original_destination = int(df_original.iloc[line_num, 1])
    if original_origin == pick_up and original_destination == drop:
        return True
    else:
        return False
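For example, if row 0 of df_original holds the numeric codes (3, 2) after the city mapping, the helper returns True only when both the predicted pick-up and drop codes match that row (the codes here are purely illustrative):
print(check_pick_up_drop_correction(pick_up=3, drop=2, line_num=0))  # True only if row 0 is (3, 2)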
Evaluate the agent's performance after Q-learning:
"""Evaluate the agent's performance after Q-learning"""
total_epochs, total_penalties, wrong_predictions, total_reward = 0, 0, 0, 0
episodes = 1000
for i in range(episodes):
epochs, penalties, reward = 0, 0, 0
#Generate the random state from an enviroment and change the pick up and drop as the fetched one
state = env.reset()
q_table[state][4]=df_original.iloc[i,0]
q_table[state][5]=df_original.iloc[i,1]
done = False
while not done:
action = np.argmax(q_table[state,:])
state, reward, done, info = env.step(action)
epochs += 1
checking = check_pick_up_drop_correction(int(q_table[state][4]), int(q_table[state][5]), i)
if checking == False:
wrong_predictions += 1
reward=-10
penalties += 1
else:
reward=20
total_penalties += penalties
total_epochs += epochs
total_reward += reward
print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")
print(f"Total number of wrong predictions", wrong_predictions)
print("Total Reward is", total_reward)
Output:
Results after 1000 episodes:
Average timesteps per episode: 196.365
Average penalties per episode: 0.019
Total number of wrong predictions 19
Total Reward is 19430
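The totals above are internally consistent: 19 of the 1000 episodes ended with a wrong pick-up/drop prediction and were scored -10, while the remaining 981 were scored +20:
print(981 * 20 + 19 * (-10))  # 19430, the reported total reward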
Contact us to get instant help with Reinforcement Learning projects at: contact@codersarts.com