# import libraries for data manipulation
import numpy as np
import pandas as pd

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive at /content/drive (Colab-only module); the dataset CSV
# is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

# Load the order data from the CSV stored on the mounted Drive.
data_path = '/content/drive/MyDrive/MIT - Data Analytics/Project/foodhub_order.csv'
df = pd.read_csv(data_path)
# Preview the first five rows of the DataFrame.
df.head(5)

df.shape  # Tuple of (number of rows, number of columns)

(1898, 9)

df.info()  # Print each column's name, non-null count and dtype, plus memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1898 non-null   object 
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 133.6+ KB

df.isnull().sum()  # Number of missing values in each column

df.describe(include="all")  # Summary statistics for numeric and non-numeric columns alike

df ['rating'].value_counts()  # Frequency of each distinct value in the rating column

# Report the number of distinct values in each identifier / categorical
# column. Keys are column names; values are the message prefixes to print.
unique_count_labels = {
    'order_id': "The number of unique order is: ",
    'customer_id': "The number of unique customer ID is: ",
    'restaurant_name': "The number of unique restaurants: ",
    'cuisine_type': "The number of unique cuisine types: ",
    'day_of_the_week': "The number of unique day of the week: ",
    'rating': "The number of unique rating: ",
}
for column_name, label in unique_count_labels.items():
    print(f"{label}{df[column_name].nunique()}")

The number of unique order is: 1898
The number of unique customer ID is: 1200
The number of unique restaurants: 178
The number of unique cuisine types: 14
The number of unique day of the week: 2
The number of unique rating: 4

# Distribution of order cost: histogram with a KDE overlay, then a boxplot.
cost_column = "cost_of_the_order"

sns.histplot(x=cost_column, kde=True, data=df)
plt.show()

sns.boxplot(x=cost_column, data=df)
plt.show()

# Number of orders per cuisine type, drawn on a wide figure.
plt.figure(figsize=(20, 10))
sns.countplot(data=df, x='cuisine_type')
# Render the figure now, as the other plotting cells do, so that the Axes
# repr is not echoed as output and later plots are not drawn onto these axes.
plt.show()

<Axes: xlabel='cuisine_type', ylabel='count'>

# Number of orders placed on weekdays versus weekends.
sns.countplot(data=df, x='day_of_the_week')
# Render the figure now, as the other plotting cells do, so that the Axes
# repr is not echoed as output and later plots are not drawn onto these axes.
plt.show()

<Axes: xlabel='day_of_the_week', ylabel='count'>

# Number of orders for each value of the rating column.
sns.countplot(data=df, x='rating')
# Render the figure now, as the other plotting cells do, so that the Axes
# repr is not echoed as output and later plots are not drawn onto these axes.
plt.show()

<Axes: xlabel='rating', ylabel='count'>

# Distribution of delivery time: histogram with a KDE overlay, then a boxplot.
delivery_column = "delivery_time"

sns.histplot(x=delivery_column, kde=True, data=df)
plt.show()

sns.boxplot(x=delivery_column, data=df)
plt.show()

# Distribution of food preparation time: histogram with a KDE overlay, then a boxplot.
prep_column = "food_preparation_time"

sns.histplot(x=prep_column, kde=True, data=df)
plt.show()

sns.boxplot(x=prep_column, data=df)
plt.show()

df['restaurant_name'].value_counts().head(5)  # The five restaurants with the most orders (value_counts sorts descending)

# Keep only weekend orders, then report the cuisine type that occurs most often.
is_weekend_order = df['day_of_the_week'] == 'Weekend'
df_weekend = df.loc[is_weekend_order]
weekend_cuisine_counts = df_weekend['cuisine_type'].value_counts()
print(f" The most popular cusine on weekend is: {weekend_cuisine_counts.idxmax()}")

 The most popular cusine on weekend is: American

# Share of orders whose cost is strictly greater than 20 dollars.
costs_above_20 = df['cost_of_the_order'] > 20
df_greater_than_20 = df.loc[costs_above_20]
percentage = (len(df_greater_than_20) / len(df)) * 100
print("Percentage of orders above 20 dollars:", round(percentage, 2), "%")

Percentage of orders above 20 dollars: 29.24 %

mean_delivery_time = df['delivery_time'].mean()  # Average of the delivery_time column over all orders
print('Mean order delivery time:', round(mean_delivery_time, 2), 'minutes')

Mean order delivery time: 24.16 minutes

df['customer_id'].value_counts().head(3)  # The three customer IDs with the most orders, with their order counts

# Bucket the order cost into 5-dollar intervals: (0, 5], (5, 10], ..., (35, 40].
df['cost_bin'] = pd.cut(df['cost_of_the_order'], bins=range(0, 41, 5))

# Mean delivery time for every (cost bin, day type) combination.
# `observed=False` is passed explicitly: grouping on the categorical
# `cost_bin` column otherwise emits a pandas FutureWarning about the default
# changing, and this value retains the current behavior.
pivot = df.pivot_table(
    index='cost_bin',
    columns='day_of_the_week',
    values='delivery_time',
    aggfunc='mean',
    observed=False,
)

# Heatmap
plt.figure(figsize=(8, 5))
sns.heatmap(pivot, annot=True, cmap='YlGnBu')
plt.title("Average Delivery Time by Cost and Day")
plt.ylabel("Cost of the order")
plt.xlabel("Day of the Week")
plt.show()

<ipython-input-146-50a6fe26980d>:2: FutureWarning: The default value of observed=False is deprecated and will change to observed=True in a future version of pandas. Specify observed=False to silence this warning and retain the current behavior
  pivot = df.pivot_table(index='cost_bin', columns='day_of_the_week', values='delivery_time', aggfunc='mean')

# Spread of order cost for each cuisine type.
plt.figure(figsize=(20, 10))
# Passing `palette` without `hue` is deprecated in seaborn; assigning the x
# variable to `hue` and disabling the legend gives the same per-category
# colouring without the FutureWarning.
sns.boxplot(
    x="cuisine_type",
    y="cost_of_the_order",
    hue="cuisine_type",
    data=df,
    palette='YlGn',
    legend=False,
)
plt.xticks(rotation=60)  # Tilt the cuisine labels so they do not overlap
plt.show()

<ipython-input-147-8b83706e854f>:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x = "cuisine_type", y = "cost_of_the_order", data = df, palette = 'YlGn')

# Spread of delivery time for each cuisine type.
plt.figure(figsize=(20, 10))
# Passing `palette` without `hue` is deprecated in seaborn; assigning the x
# variable to `hue` and disabling the legend gives the same per-category
# colouring without the FutureWarning.
sns.boxplot(
    x="cuisine_type",
    y="delivery_time",
    hue="cuisine_type",
    data=df,
    palette='PuBu',
    legend=False,
)
plt.xticks(rotation=60)  # Tilt the cuisine labels so they do not overlap
plt.show()

<ipython-input-148-b30c527a9826>:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x = "cuisine_type", y = "delivery_time", data = df, palette = 'PuBu')

# Spread of food preparation time for each cuisine type.
plt.figure(figsize=(20, 10))
# Passing `palette` without `hue` is deprecated in seaborn; assigning the x
# variable to `hue` and disabling the legend gives the same per-category
# colouring without the FutureWarning.
sns.boxplot(
    x="cuisine_type",
    y="food_preparation_time",
    hue="cuisine_type",
    data=df,
    palette='coolwarm',
    legend=False,
)
plt.xticks(rotation=60)  # Tilt the cuisine labels so they do not overlap
plt.show()

<ipython-input-149-96bb1e48ecb1>:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x = "cuisine_type", y = "food_preparation_time", data = df, palette = 'coolwarm')

# Label each order Low / Medium / High by cost using the edges 0, 15, 30, 45.
cost_edges = [0, 15, 30, 45]
cost_labels = ['Low', 'Medium', 'High']
df['cost_bucket'] = pd.cut(df['cost_of_the_order'], bins=cost_edges, labels=cost_labels)

# Delivery-time distribution per day type, split by cost bucket.
sns.violinplot(
    x='day_of_the_week',
    y='delivery_time',
    hue='cost_bucket',
    split=True,
    data=df,
)
plt.title("Delivery Time Spread by Day and Cost Bucket")
plt.show()

# Point estimate of order cost for each value of the rating column.
plt.figure(figsize=(15, 7))
sns.pointplot(
    data=df,
    x='rating',
    y='cost_of_the_order',
)
plt.show()

# Total order value per restaurant, largest first; show the top 14.
order_value_by_restaurant = df.groupby(['restaurant_name'])['cost_of_the_order'].sum()
order_value_by_restaurant.sort_values(ascending=False).head(14)

# Pairwise correlation between the three numeric measures, shown on a fixed
# -1..1 colour scale with two-decimal annotations.
col_list = ['cost_of_the_order', 'food_preparation_time', 'delivery_time']
correlation_matrix = df[col_list].corr()

plt.figure(figsize=(20, 10))
sns.heatmap(
    correlation_matrix,
    cmap="coolwarm",
    annot=True,
    fmt=".2f",
    vmin=-1,
    vmax=1,
)
plt.show()

# Work on a copy that holds only orders with an actual rating, so the rating
# column can be converted from text to integers.
has_rating = df['rating'] != 'Not given'
df_rated = df.loc[has_rating].copy()
df_rated['rating'] = df_rated['rating'].astype(int)

# Number of rated orders per restaurant, most-rated first.
ratings_per_restaurant = df_rated.groupby(['restaurant_name'])['rating'].count()
df_rating_count = ratings_per_restaurant.sort_values(ascending=False).reset_index()
df_rating_count.head()

def compute_rev(x):
    """Return the commission charged on an order of cost ``x``.

    The rate is 25% when ``x`` is greater than 20, 15% when ``x`` is greater
    than 5 (and at most 20), and 0 otherwise.
    """
    if x > 20:
        rate = 0.25
    elif x > 5:
        rate = 0.15
    else:
        rate = 0
    return x * rate

# Commission earned on each order, using the tiered rates in compute_rev.
df['Revenue'] = df['cost_of_the_order'].apply(compute_rev)

# The question asks for the net revenue across all orders, so total the
# per-order commission instead of only showing the new column.
total_revenue = df['Revenue'].sum()
print('The net revenue generated across all orders is:', round(total_revenue, 2), 'dollars')

# Preview the DataFrame with the new Revenue column.
df.head()

# Total time per order = food preparation time + delivery time.
df['total_time'] = df['food_preparation_time'] + df['delivery_time']

# Orders whose total time is strictly more than 60 minutes.
df_greater_than_60 = df.loc[df['total_time'] > 60]
slow_order_count = len(df_greater_than_60)
print('The Total orders taking more than 60 minutes are:', slow_order_count)

# The same count expressed as a share of all orders.
percentage = (slow_order_count / len(df)) * 100
print("Percentage of orders takinge more than 60 minutes are:", round(percentage, 2), '%')

The Total orders taking more than 60 minutes are: 200
Percentage of orders takinge more than 60 minutes are: 10.54 %

# Mean delivery time for each day type, rounded to whole minutes.
is_weekday = df['day_of_the_week'] == 'Weekday'
is_weekend = df['day_of_the_week'] == 'Weekend'
weekday_mean_delivery = df.loc[is_weekday, 'delivery_time'].mean()
weekend_mean_delivery = df.loc[is_weekend, 'delivery_time'].mean()

print('Mean delivery time on weekdays is:', round(weekday_mean_delivery), 'minutes')

print('Mean delivery time on weekend is:', round(weekend_mean_delivery), 'minutes')

Mean delivery time on weekdays is: 28 minutes
Mean delivery time on weekend is: 22 minutes

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time
count	1.898000e+03	1898.000000	1898	1898	1898.000000	1898	1898	1898.000000	1898.000000
unique	NaN	NaN	178	14	NaN	2	4	NaN	NaN
top	NaN	NaN	Shake Shack	American	NaN	Weekend	Not given	NaN	NaN
freq	NaN	NaN	219	584	NaN	1351	736	NaN	NaN
mean	1.477496e+06	171168.478398	NaN	NaN	16.498851	NaN	NaN	27.371970	24.161749
std	5.480497e+02	113698.139743	NaN	NaN	7.483812	NaN	NaN	4.632481	4.972637
min	1.476547e+06	1311.000000	NaN	NaN	4.470000	NaN	NaN	20.000000	15.000000
25%	1.477021e+06	77787.750000	NaN	NaN	12.080000	NaN	NaN	23.000000	20.000000
50%	1.477496e+06	128600.000000	NaN	NaN	14.140000	NaN	NaN	27.000000	25.000000
75%	1.477970e+06	270525.000000	NaN	NaN	22.297500	NaN	NaN	31.000000	28.000000
max	1.478444e+06	405334.000000	NaN	NaN	35.410000	NaN	NaN	35.000000	33.000000

	count
rating
Not given	736
5	588
4	386
3	188

	count
restaurant_name
Shake Shack	219
The Meatball Shop	132
Blue Ribbon Sushi	119
Blue Ribbon Fried Chicken	96
Parm	68

	cost_of_the_order
restaurant_name
Shake Shack	3579.53
The Meatball Shop	2145.21
Blue Ribbon Sushi	1903.95
Blue Ribbon Fried Chicken	1662.29
Parm	1112.76
RedFarm Broadway	965.13
RedFarm Hudson	921.21
TAO	834.50
Han Dynasty	755.29
Blue Ribbon Sushi Bar & Grill	666.62
Rubirosa	660.45
Sushi of Gari 46	640.87
Nobu Next Door	623.67
Five Guys Burgers and Fries	506.47

	restaurant_name	rating
0	Shake Shack	133
1	The Meatball Shop	84
2	Blue Ribbon Sushi	73
3	Blue Ribbon Fried Chicken	64
4	RedFarm Broadway	41

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time
0	1477147	337525	Hangawi	Korean	30.75	Weekend	Not given	25	20
1	1477685	358141	Blue Ribbon Sushi Izakaya	Japanese	12.08	Weekend	Not given	25	23
2	1477070	66393	Cafe Habana	Mexican	12.23	Weekday	5	23	28
3	1477334	106968	Blue Ribbon Fried Chicken	American	29.20	Weekend	3	25	15
4	1478249	76942	Dirty Bird to Go	American	11.59	Weekday	4	25	24

	count
customer_id
52832	13
47440	10
83287	9

Project Foundations for Data Science: FoodHub Data Analysis¶

Context¶

Objective¶

Data Description¶

Data Dictionary¶

Let us start by importing the required libraries¶

Understanding the structure of the data¶

Observations:¶

Question 1: How many rows and columns are present in the data? [0.5 mark]¶

Observations:¶

Question 2: What are the datatypes of the different columns in the dataset? (The info() function can be used) [0.5 mark]¶

Observations:¶

Question 3: Are there any missing values in the data? If yes, treat them using an appropriate method. [1 mark]¶

Observations:¶

Question 4: Check the statistical summary of the data. What is the minimum, average, and maximum time it takes for food to be prepared once an order is placed? [2 marks]¶

Observations:¶

Question 5: How many orders are not rated? [1 mark]¶

Observations:¶

Exploratory Data Analysis (EDA)¶

Univariate Analysis¶

Question 6: Explore all the variables and provide observations on their distributions. (Generally, histograms, boxplots, countplots, etc. are used for univariate exploration.) [9 marks]¶

Question 7: Which are the top 5 restaurants in terms of the number of orders received? [1 mark]¶

Observations:¶

Question 8: Which is the most popular cuisine on weekends? [1 mark]¶

Observations:¶

Question 9: What percentage of the orders cost more than 20 dollars? ¶

Observations:¶

Question 10: What is the mean order delivery time? [1 mark]¶

Observations:¶

Question 11: The company has decided to give 20% discount vouchers to the top 3 most frequent customers. Find the IDs of these customers and the number of orders they placed. [1 mark]¶

Observations:¶

Multivariate Analysis¶

Question 12: Perform a multivariate analysis to explore relationships between the important variables in the dataset. (It is a good idea to explore relations between numerical variables as well as relations between numerical and categorical variables) [10 marks]¶

Observations:¶

Question 14: The company charges the restaurant 25% on the orders having cost greater than 20 dollars and 15% on the orders having cost greater than 5 dollars. Find the net revenue generated by the company across all orders. [3 marks]¶

Observations:¶

Question 15: The company wants to analyze the total time required to deliver the food. What percentage of orders take more than 60 minutes to get delivered from the time the order is placed? (The food has to be prepared and then delivered.) [2 marks]¶

Observations:¶

Question 16: The company wants to analyze the delivery time of the orders on weekdays and weekends. How does the mean delivery time vary during weekdays and weekends? [2 marks]¶

Observations:¶

Conclusion and Recommendations¶

Question 17: What are your conclusions from the analysis? What recommendations would you like to share to help improve the business? (You can use cuisine type and feedback ratings to drive your business recommendations.) [6 marks]¶

Conclusions:¶

Recommendations:¶