# import libraries for data manipulation
import numpy as np
import pandas as pd

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns


# read the data
df = pd.read_csv('foodhub_order.csv')
# returns the first 5 rows
df.head()


# Check the number of rows and columns
num_rows, num_cols = df.shape

# Print the results
print("Number of rows:", num_rows)
print("Number of columns:", num_cols)

Number of rows: 1898
Number of columns: 9


# Use info() to print a concise summary of the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1898 non-null   object 
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 133.6+ KB


# Check for missing values
df.isnull().sum()

order_id                 0
customer_id              0
restaurant_name          0
cuisine_type             0
cost_of_the_order        0
day_of_the_week          0
rating                   0
food_preparation_time    0
delivery_time            0
dtype: int64


# Calculate the minimum, average, and maximum food preparation time
min_prep_time = df['food_preparation_time'].min()
avg_prep_time = df['food_preparation_time'].mean()
max_prep_time = df['food_preparation_time'].max()

# Print the results
print("Minimum food preparation time:", min_prep_time)
print("Average food preparation time:", avg_prep_time)
print("Maximum food preparation time:", max_prep_time)

Minimum food preparation time: 20
Average food preparation time: 27.371970495258168
Maximum food preparation time: 35


# Count the number of orders that are not rated
num_unrated_orders = df[df['rating'] == 'Not given'].shape[0]

# Print the number of orders that are not rated
print("Number of orders not rated:", num_unrated_orders)# Write your code here

Number of orders not rated: 736


# Histogram for 'cost_of_the_order'
plt.figure(figsize=(8, 6))
sns.histplot(data=df, x='cost_of_the_order')
plt.title('Distribution of Cost of the Order')
plt.xlabel('Cost of the Order')
plt.ylabel('Count')
plt.show()# Write the code here


# Create the histogram
plt.figure(figsize=(10, 6))
plt.hist(df['food_preparation_time'], bins=10, edgecolor='black')
plt.xlabel('Food Preparation Time')
plt.ylabel('Frequency')
plt.title('Histogram of Food Preparation Time')
plt.tight_layout()

# Display the plot
plt.show()


# Combine 'food_preparation_time' and 'delivery_time' into a single DataFrame
time_df = pd.DataFrame({'Food Preparation Time': df['food_preparation_time'],
                        'Delivery Time': df['delivery_time']})

# Box plot for 'food_preparation_time' and 'delivery_time'
plt.figure(figsize=(8, 6))
sns.boxplot(data=time_df)
plt.title('Box Plot of Food Preparation Time and Delivery Time')
plt.ylabel('Time (minutes)')
plt.show()


# Count plot for 'day_of_the_week'
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='day_of_the_week')
plt.title('Count of Orders by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Count')
plt.show()


# Count plot for 'rating'
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='rating')
plt.title('Count of Orders by Rating')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()


# Group the DataFrame by 'restaurant_name' and count the number of orders
restaurant_orders = df.groupby('restaurant_name')['order_id'].count()

# Sort the restaurants in descending order based on the number of orders
top_restaurants = restaurant_orders.sort_values(ascending=False).head(5)

# Print the top 5 restaurants
print("Top 5 restaurants by number of orders:")
print(top_restaurants)

Top 5 restaurants by number of orders:
restaurant_name
Shake Shack                  219
The Meatball Shop            132
Blue Ribbon Sushi            119
Blue Ribbon Fried Chicken     96
Parm                          68
Name: order_id, dtype: int64


# Filter the data for weekends
weekend_data = df[df['day_of_the_week'].isin(['Weekend'])]

# Count the frequency of each cuisine type
cuisine_frequency = weekend_data['cuisine_type'].value_counts()
print("Cuisine frequency is:,", cuisine_frequency)
# Get the most popular cuisine
most_popular_cuisine = cuisine_frequency.idxmax()

# Print the most popular cuisine
print("The most popular cuisine on weekends is:", most_popular_cuisine)

Cuisine frequency is:, cuisine_type
American          415
Japanese          335
Italian           207
Chinese           163
Mexican            53
Indian             49
Mediterranean      32
Middle Eastern     32
Thai               15
French             13
Korean             11
Southern           11
Spanish            11
Vietnamese          4
Name: count, dtype: int64
The most popular cuisine on weekends is: American


# Number of total orders
total_orders = len(df)
print("Total orders:" , total_orders)
# Number of orders above 20$
orders_over_20 = len(df[df['cost_of_the_order'] > 20])
print("Orders over 20$:" , orders_over_20)
# Calculating the percentage
percentage_over_20 = (orders_over_20 / total_orders) * 100
print("Percentage of orders that cost more than $20:", percentage_over_20)

Total orders: 1898
Orders over 20$: 555
Percentage of orders that cost more than $20: 29.24130663856691


# The mean order delivery time
mean_delivery_time = df['delivery_time'].mean()
print("Mean order delivery time:", mean_delivery_time)

Mean order delivery time: 24.161749209694417


# Group the dataset by 'customer_id' and count the number of orders
customer_orders = df.groupby('customer_id')['order_id'].count()
print("Customer orders:\n", customer_orders.head(5))
print()
# Sort the customers in descending order based on the number of orders
top_customers = customer_orders.sort_values(ascending=False).head(3)
print("Top customers:\n", top_customers)
print()
# Get the customer IDs and their corresponding number of orders
customer_ids = top_customers.index.tolist()
order_counts = top_customers.values.tolist()

# Print the IDs and number of orders for the top 3 customers
print("Top 3 most frequent customers:")
for i in range(len(customer_ids)):
    print("Customer ID:", customer_ids[i])
    print("Number of Orders:", order_counts[i])
    print()

Customer orders:
 customer_id
1311    1
5139    1
5444    1
5693    1
6987    2
Name: order_id, dtype: int64

Top customers:
 customer_id
52832    13
47440    10
83287     9
Name: order_id, dtype: int64

Top 3 most frequent customers:
Customer ID: 52832
Number of Orders: 13

Customer ID: 47440
Number of Orders: 10

Customer ID: 83287
Number of Orders: 9


# Selecting the important variables for analysis
important_vars = ['cost_of_the_order', 'food_preparation_time', 'delivery_time', 'cuisine_type', 'rating']

# Creating a DataFrame with the selected variables
selected_df = df[important_vars]

# Scatter plot matrix
sns.pairplot(selected_df)
plt.show()


# Box plot with categorical variable
plt.figure(figsize=(10, 6))
sns.boxplot(x='cuisine_type', y='cost_of_the_order', data=df)
plt.xticks(rotation=45)
plt.title('Box Plot of Cost of Order by Cuisine Type')
plt.show()


# Heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(df[['cost_of_the_order', 'food_preparation_time', 'delivery_time']].corr(), annot=True)
plt.title('Correlation Heatmap')
plt.show()


# Calculate the average rating_numeric for each cuisine_type
average_rating_by_cuisine = df.groupby('cuisine_type')['rating_numeric'].mean()

# Create a bar plot
plt.figure(figsize=(10, 6))
average_rating_by_cuisine.plot(kind='bar')
plt.xlabel('Cuisine Type')
plt.ylabel('Average Rating')
plt.title('Average Rating by Cuisine Type')
plt.xticks(rotation=45)
plt.ylim(3.5, plt.ylim()[1])  # Set the y-axis limits starting from 3
plt.tight_layout()

# Display the plot
plt.show()


# Calculate the revenue for each cuisine by multiplying the count of order_id with the sum of cost_of_the_order
revenue_by_cuisine = df.groupby('cuisine_type').apply(lambda x: x['order_id'].count() * x['cost_of_the_order'].sum()).reset_index()
revenue_by_cuisine.columns = ['Cuisine Type', 'Revenue']

# Sort the DataFrame by revenue in descending order
revenue_by_cuisine = revenue_by_cuisine.sort_values('Revenue', ascending=False)

# Create a bar plot
plt.figure(figsize=(10, 6))
plt.bar(revenue_by_cuisine['Cuisine Type'], revenue_by_cuisine['Revenue'])
plt.xlabel('Cuisine Type')
plt.ylabel('Revenue')
plt.title('Revenue by Cuisine Type (Ordered)')
plt.xticks(rotation=45)
plt.tight_layout()

# Display the plot
plt.show()


# Create the box plot
plt.figure(figsize=(10, 6))
sns.boxplot(x='cuisine_type', y='food_preparation_time', data=df)
plt.xticks(rotation=45)
plt.xlabel('Cuisine Type')
plt.ylabel('Food Preparation Time')
plt.title('Box Plot of Food Preparation Time by Cuisine Type')
plt.tight_layout()

# Display the plot
plt.show()


# Convert 'rating' column to numeric values
df['rating_numeric'] = pd.to_numeric(df['rating'], errors='coerce')

# Filter the data based on the criteria
promotional_restaurants = df.groupby('restaurant_name').filter(lambda x: len(x) > 50 and x['rating_numeric'].mean() > 4)
promotional_restaurants


# Create a pivot table with restaurant name, average of 'rating_numeric', and count of 'order_id'
pivot_table = pd.pivot_table(df, values=['rating_numeric', 'order_id'], 
                             index='restaurant_name', 
                             aggfunc={'rating_numeric': 'mean', 
                                      'order_id': 'count'})
pivot_table


# Filter the pivot table based on the criteria of count of orders > 50 and average rating > 4
filtered_pivot_table = pivot_table[(pivot_table['order_id'] > 50) & (pivot_table['rating_numeric'] > 4)]

# Print the filtered pivot table
print(filtered_pivot_table)

                           order_id  rating_numeric
restaurant_name                                    
Blue Ribbon Fried Chicken        96        4.328125
Blue Ribbon Sushi               119        4.219178
Parm                             68        4.128205
RedFarm Broadway                 59        4.243902
RedFarm Hudson                   55        4.176471
Shake Shack                     219        4.278195
The Meatball Shop               132        4.511905


# Calculate the revenue for each order
df['revenue'] = 0

# Apply the charging percentages
df.loc[df['cost_of_the_order'] > 20, 'revenue'] = df['cost_of_the_order'] * 0.25
df.loc[(df['cost_of_the_order'] > 5) & (df['cost_of_the_order'] <= 20), 'revenue'] = df['cost_of_the_order'] * 0.15

# Calculate the net revenue across all orders
net_revenue = df['revenue'].sum()

# Print the net revenue
print("Net Revenue: $", net_revenue)

Net Revenue: $ 6166.303


# Calculate the total time required for each order (food preparation time + delivery time)
df['total_time'] = df['food_preparation_time'] + df['delivery_time']

# Calculate the percentage of orders that take more than 60 minutes
percentage_above_60 = (df[df['total_time'] > 60].shape[0] / df.shape[0]) * 100

# Print the percentage
print("Percentage of orders taking more than 60 minutes: {:.2f}%".format(percentage_above_60))

Percentage of orders taking more than 60 minutes: 10.54%


# Calculate the mean delivery time for weekdays and weekends
mean_delivery_weekdays = df.loc[df['day_of_the_week'] == 'Weekday', 'delivery_time'].mean()
mean_delivery_weekends = df.loc[df['day_of_the_week'] == 'Weekend', 'delivery_time'].mean()

# Print the mean delivery time for weekdays and weekends
print("Mean Delivery Time on Weekdays: {:.2f} minutes".format(mean_delivery_weekdays))
print("Mean Delivery Time on Weekends: {:.2f} minutes".format(mean_delivery_weekends))

Mean Delivery Time on Weekdays: 28.34 minutes
Mean Delivery Time on Weekends: 22.47 minutes

	order_id	rating_numeric
restaurant_name
'wichcraft	1	5.000000
12 Chairs	4	4.500000
5 Napkin Burger	5	4.000000
67 Burger	1	5.000000
Alidoro	1	NaN
...	...	...
Zero Otto Nove	2	4.000000
brgr	2	3.000000
da Umberto	3	5.000000
ilili Restaurant	18	4.153846
indikitch	4	4.500000

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time
0	1477147	337525	Hangawi	Korean	30.75	Weekend	Not given	25	20
1	1477685	358141	Blue Ribbon Sushi Izakaya	Japanese	12.08	Weekend	Not given	25	23
2	1477070	66393	Cafe Habana	Mexican	12.23	Weekday	5	23	28
3	1477334	106968	Blue Ribbon Fried Chicken	American	29.20	Weekend	3	25	15
4	1478249	76942	Dirty Bird to Go	American	11.59	Weekday	4	25	24

Project Foundations for Data Science: FoodHub Data Analysis¶

Context¶

Objective¶

Data Description¶

Data Dictionary¶

Let us start by importing the required libraries¶

Understanding the structure of the data¶

Observations:¶

Question 1: How many rows and columns are present in the data? [0.5 mark]¶

Observations:¶

Question 2: What are the datatypes of the different columns in the dataset? (The info() function can be used) [0.5 mark]¶

Observations:¶

Question 3: Are there any missing values in the data? If yes, treat them using an appropriate method. [1 mark]¶

Observations:¶

Question 4: Check the statistical summary of the data. What is the minimum, average, and maximum time it takes for food to be prepared once an order is placed? [2 marks]¶

Observations:¶

Question 5: How many orders are not rated? [1 mark]¶

Observations:¶

Exploratory Data Analysis (EDA)¶

Univariate Analysis¶

Question 6: Explore all the variables and provide observations on their distributions. (Generally, histograms, boxplots, countplots, etc. are used for univariate exploration.) [5 marks]¶

Question 7: Which are the top 5 restaurants in terms of the number of orders received? [1 mark]¶

Observations:¶

Question 8: Which is the most popular cuisine on weekends? [1 mark]¶

Observations:¶

Question 9: What percentage of the orders cost more than 20 dollars? [1 marks]¶

Observations:¶

Question 10: What is the mean order delivery time? [1 mark]¶

Observations:¶

Question 11: The company has decided to give 20% discount vouchers to the top 3 most frequent customers. Find the IDs of these customers and the number of orders they placed. [1 mark]¶

Observations:¶

Multivariate Analysis¶

Question 12: Perform a multivariate analysis to explore relationships between the important variables in the dataset. (It is a good idea to explore relations between numerical variables as well as relations between numerical and categorical variables) [5 marks]¶

Observations:¶

Question 14: The company charges the restaurant 25% on the orders having cost greater than 20 dollars and 15% on the orders having cost greater than 5 dollars. Find the net revenue generated by the company across all orders. [1.5 marks]¶

Observations:¶

Question 15: The company wants to analyze the total time required to deliver the food. What percentage of orders take more than 60 minutes to get delivered from the time the order is placed? (The food has to be prepared and then delivered.) [1 marks]¶

Observations:¶

Question 16: The company wants to analyze the delivery time of the orders on weekdays and weekends. How does the mean delivery time vary during weekdays and weekends? [1 marks]¶

Observations:¶

Conclusion and Recommendations¶

Question 17: What are your conclusions from the analysis? What recommendations would you like to share to help improve the business? (You can use cuisine type and feedback ratings to drive your business recommendations.) [5 marks]¶

Conclusions:¶

Recommendations:¶