import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Zomato restaurant listings from a local CSV export.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook only runs where the same folder layout exists.
df = pd.read_csv("C:\\Users\\DeLL\\OneDrive\\Desktop\\Alfido Tech Internship\\zomato.csv")
# Preview the first rows of the raw frame.
df.head()

# (rows, columns) of the raw frame.
df.shape

(56252, 13)

# Print column dtypes and non-null counts, then count the missing values
# in each column.
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56252 entries, 0 to 56251
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   address                      56235 non-null  object
 1   name                         56236 non-null  object
 2   online_order                 56233 non-null  object
 3   book_table                   56194 non-null  object
 4   rate                         48414 non-null  object
 5   votes                        56174 non-null  object
 6   phone                        54956 non-null  object
 7   location                     56126 non-null  object
 8   rest_type                    55914 non-null  object
 9   dish_liked                   28027 non-null  object
 10  cuisines                     56049 non-null  object
 11  approx_cost(for two people)  55731 non-null  object
 12  listed_in(type)              51642 non-null  object
dtypes: object(13)
memory usage: 5.6+ MB

address                           17
name                              16
online_order                      19
book_table                        58
rate                            7838
votes                             78
phone                           1296
location                         126
rest_type                        338
dish_liked                     28225
cuisines                         203
approx_cost(for two people)      521
listed_in(type)                 4610
dtype: int64

# Remove duplicated rows, modifying the frame in place.
df.drop_duplicates(inplace=True)

# Look at the first 20 distinct raw values of the rating column to see
# which formats need cleaning.
df['rate'].unique()[:20]

array(['4.1/5', '3.8/5', '3.7/5', '3.6/5', '4.6/5', '4.0/5',
       '\\nTop floor', '4.2/5', '3.9/5', '3.1/5', '3.0/5', '3.2/5',
       '3.3/5', '2.8/5', " ('Rated 1.0'", " ('Rated 5.0'", '4.4/5',
       ' always a good experience',
       ' a mocktail and did i just hear unlimited desserts?!\\nGoodbye midweek blues!!!?\\n\\nI would specially mention the staff here-Chaitanya',
       '4.3/5'], dtype=object)

import re

# A well-formed rating cell has the shape "<number>/5", e.g. "4.1/5" or
# "3.9 /5". Surrounding whitespace and spaces around the slash are tolerated.
_RATE_PATTERN = re.compile(r'\s*(\d+(?:\.\d+)?)\s*/\s*5\s*')


def clean_rate(x):
    """Convert a raw ``rate`` cell into a float rating.

    Only cells whose entire content has the ``<number>/5`` shape are
    accepted. Searching for the first decimal anywhere in the text would
    turn review fragments that ended up in this column (for example
    ``" ('Rated 1.0'"``) into fake ratings, so the whole string must match.

    Args:
        x: The raw cell value. Usually a string; may be NaN/None or any
            other non-string value for missing data.

    Returns:
        The rating as a float, or None when ``x`` is not a string or is not
        a well-formed ``<number>/5`` rating.
    """
    if not isinstance(x, str):
        return None
    match = _RATE_PATTERN.fullmatch(x)
    return float(match.group(1)) if match else None

# Run every raw rating cell through clean_rate and store the result back
# in the same column.
parsed_rate = df['rate'].apply(clean_rate)
df['rate'] = parsed_rate

# Keep only the rows for which a rating could be parsed.
has_rate = df['rate'].notnull()
df = df[has_rate]

# Summary statistics of the parsed ratings.
df['rate'].describe()

count    32045.000000
mean         3.714575
std          0.489479
min          0.500000
25%          3.400000
50%          3.800000
75%          4.000000
max          5.000000
Name: rate, dtype: float64

# Inspect the rows whose parsed rating is below 1.
below_one = df['rate'] < 1
df[below_one]

# Keep only ratings inside the closed range 1 to 5.
at_least_one = df['rate'] >= 1
at_most_five = df['rate'] <= 5
df = df[at_least_one & at_most_five]

# Summary statistics after the range filter.
df['rate'].describe()

count    32044.000000
mean         3.714675
std          0.489158
min          1.000000
25%          3.400000
50%          3.800000
75%          4.000000
max          5.000000
Name: rate, dtype: float64

# (rows, columns) remaining after the rating cleanup.
df.shape

(32044, 13)

# Histogram of the cleaned ratings (20 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['rate'], bins=20, kde=True, ax=ax)
ax.set_title("Distribution of Restaurant Ratings")
ax.set_xlabel("Rating")
ax.set_ylabel("Frequency")
plt.show()

# The ten most frequent values of the location column, most frequent first.
location_counts = df['location'].value_counts()
top_locations = location_counts.head(10)

# Horizontal bar chart: one bar per location, bar length = number of rows.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_locations.values, y=top_locations.index, ax=ax)
ax.set_title("Top 10 Restaurant Locations")
ax.set_xlabel("Number of Restaurants")
ax.set_ylabel("Location")
plt.show()

# The ten most frequent values of the cuisines column, most frequent first.
# Each distinct cell value is counted as-is.
cuisine_counts = df['cuisines'].value_counts()
top_cuisines = cuisine_counts.head(10)

# Horizontal bar chart: one bar per cuisine value, bar length = number of rows.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_cuisines.values, y=top_cuisines.index, ax=ax)
ax.set_title("Top 10 Cuisines")
ax.set_xlabel("Count")
ax.set_ylabel("Cuisine")
plt.show()

# Check how the cost column was parsed before trying to use it numerically.
df['approx_cost(for two people)'].dtype

dtype('O')

# Clean the cost-for-two column: parse it to numbers, then drop rows with a
# missing or extreme cost.
cost_col = 'approx_cost(for two people)'
max_cost = 5000  # exclusive upper bound; rows at or above it are treated as outliers

# Strip thousands separators ("1,200" -> "1200") and convert to numbers.
# Values that still cannot be parsed become NaN.
cost = pd.to_numeric(
    df[cost_col].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

# assign() returns a new frame, so nothing is written into the filtered slice
# left by the earlier rating filters (avoids SettingWithCopyWarning).
df = df.assign(**{cost_col: cost})

# Remove rows with a missing cost and extreme outliers in one pass; .copy()
# makes the result an independent frame that is safe to modify later.
df = df[df[cost_col].notna() & (df[cost_col] < max_cost)].copy()

df[cost_col].describe()

count    30700.000000
mean       613.506450
std        463.014449
min         40.000000
25%        300.000000
50%        500.000000
75%        750.000000
max       4500.000000
Name: approx_cost(for two people), dtype: float64

plt.figure(figsize=(8,5))

# Plot a random subset of at most 3000 rows. The size is capped at the frame
# length so sample() cannot fail on a smaller frame, and the seed is fixed so
# the figure is the same on every run.
sample_size = min(3000, len(df))

sns.scatterplot(
    x='approx_cost(for two people)',
    y='rate',
    data=df.sample(n=sample_size, random_state=42)
)

plt.title("Price vs Rating")
plt.xlabel("Cost for Two")
plt.ylabel("Rating")
plt.show()

# Correlation between rating and cost, computed on the full frame (not the
# plotted sample).
df[['rate','approx_cost(for two people)']].corr()

# The votes column is loaded as text (object dtype), so convert it to numbers
# first; otherwise the scatter axis is categorical and corr() cannot use the
# column. The conversion is done on a local two-column frame so df itself is
# left unchanged. Rows whose votes cannot be parsed are dropped.
votes_vs_rate = df[['votes', 'rate']].assign(
    votes=pd.to_numeric(df['votes'], errors='coerce')
).dropna()

plt.figure(figsize=(8,5))
# Random subset of at most 3000 rows, capped at the frame length and seeded
# so the figure is reproducible.
sns.scatterplot(
    x='votes',
    y='rate',
    data=votes_vs_rate.sample(n=min(3000, len(votes_vs_rate)), random_state=42)
)
plt.title("Votes vs Rating")
plt.xlabel("Number of Votes")
plt.ylabel("Rating")
plt.show()

# Correlation between rating and (numeric) votes over all parsed rows.
votes_vs_rate[['rate','votes']].corr()

# Recompute the ten most frequent locations on the frame as it stands at
# this point in the notebook.
location_counts = df['location'].value_counts()
top_locations = location_counts.head(10)

# Horizontal bar chart of those counts.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_locations.values, y=top_locations.index, ax=ax)
ax.set_title("Top 10 Restaurant Locations")
ax.set_xlabel("Number of Restaurants")
ax.set_ylabel("Location")
plt.show()

# Recompute the ten most frequent cuisine values on the frame as it stands
# at this point in the notebook.
cuisine_counts = df['cuisines'].value_counts()
top_cuisines = cuisine_counts.head(10)

# Horizontal bar chart of those counts.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_cuisines.values, y=top_cuisines.index, ax=ax)
ax.set_title("Top 10 Cuisines")
ax.set_xlabel("Number of Restaurants")
ax.set_ylabel("Cuisine")
plt.show()

	address	name	online_order	book_table	rate	votes	phone	location	rest_type	dish_liked	cuisines	approx_cost(for two people)	listed_in(type)
0	942, 21st Main Road, 2nd Stage, Banashankari, ...	Jalsa	Yes	Yes	4.1/5	775	080 42297555\r\n+91 9743772233	Banashankari	Casual Dining	Pasta, Lunch Buffet, Masala Papad, Paneer Laja...	North Indian, Mughlai, Chinese	800	Buffet
1	2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...	Spice Elephant	Yes	No	4.1/5	787	080 41714161	Banashankari	Casual Dining	Momos, Lunch Buffet, Chocolate Nirvana, Thai G...	Chinese, North Indian, Thai	800	Buffet
2	1112, Next to KIMS Medical College, 17th Cross...	San Churro Cafe	Yes	No	3.8/5	918	+91 9663487993	Banashankari	Cafe, Casual Dining	Churros, Cannelloni, Minestrone Soup, Hot Choc...	Cafe, Mexican, Italian	800	Buffet
3	1st Floor, Annakuteera, 3rd Stage, Banashankar...	Addhuri Udupi Bhojana	No	No	3.7/5	88	+91 9620009302	Banashankari	Quick Bites	Masala Dosa	South Indian, North Indian	300	Buffet
4	10, 3rd Floor, Lakshmi Associates, Gandhi Baza...	Grand Village	No	No	3.8/5	166	+91 8026612447\r\n+91 9901210005	Basavanagudi	Casual Dining	Panipuri, Gol Gappe	North Indian, Rajasthani	600	Buffet

Zomato Dataset Analysis¶

Executive Summary¶

1. Objective¶

2. Dataset Description¶

3. Data Understanding¶

4. Data Cleaning¶

5. Exploratory Data Analysis¶

5.1 Distribution of Ratings¶

5.2 Top Restaurant Locations¶

5.3 Most Popular Cuisines¶

5.4 Price vs Rating Analysis¶

5.5 Votes vs Rating Analysis¶

5.6 Top Restaurant Locations (After Cost Filtering)¶

5.7 Most Popular Cuisines (After Cost Filtering)¶

6. Business Recommendations¶

7. Conclusion¶

	rate	approx_cost(for two people)
rate	1.000000	0.385836
approx_cost(for two people)	0.385836	1.000000