Solutions for Assignment 5: Exploring Yelp Reviews in Philadelphia¶
Luming Xu, 2024-12-04¶
In this assignment, we'll explore restaurant review data available through the Yelp Dataset Challenge. The dataset includes Yelp data for user reviews and business information for many metropolitan areas. I've already downloaded this dataset (8 GB total!) and extracted the data files for reviews and restaurants in Philadelphia. I've placed these data files into the data/ directory in this repository.
This assignment is broken into two parts:
Part 1: Analyzing correlations between restaurant reviews and census data
We'll explore the relationship between restaurant reviews and the income levels of the restaurant's surrounding area.
Part 2: Exploring the impact of fast food restaurants
We'll run a sentiment analysis on reviews of fast food restaurants and estimate income levels in neighborhoods with fast food restaurants. We'll test how well our sentiment analysis works by comparing the number of stars to the sentiment of reviews.
1. Correlating restaurant ratings and income levels¶
In this part, we'll use the census API to download household income data and explore how it correlates with restaurant review data.
1.1 Query the Census API¶
Use the cenpy package to download median household income in the past 12 months by census tract from the 2021 ACS 5-year data set for your county of interest.
You have two options to find the correct variable names:
- Search through: https://api.census.gov/data/2021/acs/acs5/variables.html
- Initialize an API connection and use the .varslike() function to search for the proper keywords, as sketched below
At the end of this step, you should have a pandas DataFrame holding the income data for all census tracts within the county being analyzed. Feel free to rename your variable from the ACS so it has a more meaningful name!
::: {.callout-caution} Some census tracts won't have any value because there are not enough households in that tract. The census will use a negative number as a default value for those tracts. You can safely remove those tracts from the analysis! :::
import pandas as pd
import cenpy

# Connect to the 2021 ACS 5-year dataset
acs = cenpy.remote.APIConnection("ACSDT5Y2021")

# B19013_001E: median household income in the past 12 months
variables = ["NAME", "B19013_001E"]

# FIPS codes: Pennsylvania is state 42 and Philadelphia County is 101
# (county codes can also be looked up by querying geo_unit="county:*"
# and searching the returned names for "Philadelphia")
pa_state_code = "42"
philly_county_code = "101"

philly_medhhinc = acs.query(
    cols=variables,
    geo_unit="tract:*",
    geo_filter={"state": pa_state_code, "county": philly_county_code},
)

for variable in variables:
    # Convert all variables EXCEPT for NAME
    if variable != "NAME":
        philly_medhhinc[variable] = pd.to_numeric(philly_medhhinc[variable], errors="coerce")

# Give the income variable a more meaningful name
philly_medhhinc = philly_medhhinc.rename(columns={"B19013_001E": "medhhinc"})

# Drop tracts with the Census placeholder (negative) income values
philly_medhhinc = philly_medhhinc.loc[philly_medhhinc["medhhinc"] > 0]

philly_medhhinc.head()
|  | NAME | medhhinc | state | county | tract |
|---|---|---|---|---|---|
| 0 | Census Tract 1.01, Philadelphia County, Pennsy... | 104052 | 42 | 101 | 000101 |
| 1 | Census Tract 1.02, Philadelphia County, Pennsy... | 91944 | 42 | 101 | 000102 |
| 2 | Census Tract 2, Philadelphia County, Pennsylvania | 91067 | 42 | 101 | 000200 |
| 3 | Census Tract 3, Philadelphia County, Pennsylvania | 86782 | 42 | 101 | 000300 |
| 4 | Census Tract 4.01, Philadelphia County, Pennsy... | 67188 | 42 | 101 | 000401 |
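As a quick check on the cleaning step, you can summarize the remaining income values (a sketch; the exact numbers depend on your county and ACS vintage):

# Summary statistics for the cleaned median household income values
print(philly_medhhinc["medhhinc"].describe())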
1.2 Download census tracts from the Census and merge the data from part 1.1¶
- Download census tracts for the desired geography using the pygris package
- Merge the downloaded census tracts with the household income DataFrame
#!pip install pygris
import pygris
philly_tracts = pygris.tracts(
state=pa_state_code, county=philly_county_code, year=2021
)
medhhinc_analysis = philly_medhhinc.merge(
philly_tracts,
right_on=["STATEFP", "COUNTYFP", "TRACTCE"],
left_on=["state", "county", "tract"],
)
medhhinc_analysis = medhhinc_analysis[["NAME_x","medhhinc", "state", "county", "tract", "GEOID", "geometry"]]
medhhinc_analysis.head()
|  | NAME_x | medhhinc | state | county | tract | GEOID | geometry |
|---|---|---|---|---|---|---|---|
| 0 | Census Tract 1.01, Philadelphia County, Pennsy... | 104052 | 42 | 101 | 000101 | 42101000101 | POLYGON ((-75.15221 39.94997, -75.15203 39.950... |
| 1 | Census Tract 1.02, Philadelphia County, Pennsy... | 91944 | 42 | 101 | 000102 | 42101000102 | POLYGON ((-75.15155 39.95544, -75.15148 39.955... |
| 2 | Census Tract 2, Philadelphia County, Pennsylvania | 91067 | 42 | 101 | 000200 | 42101000200 | POLYGON ((-75.16289 39.95533, -75.16269 39.956... |
| 3 | Census Tract 3, Philadelphia County, Pennsylvania | 86782 | 42 | 101 | 000300 | 42101000300 | POLYGON ((-75.17994 39.96003, -75.17942 39.959... |
| 4 | Census Tract 4.01, Philadelphia County, Pennsy... | 67188 | 42 | 101 | 000401 | 42101000401 | POLYGON ((-75.18091 39.95298, -75.18022 39.954... |
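One subtlety worth noting: calling .merge() on a plain pandas DataFrame returns a plain DataFrame, even though the geometry column tags along. If you want to map the merged income data later, a minimal sketch to restore the GeoDataFrame type:

import geopandas as gpd

# Re-wrap the merged result so geopandas plotting and CRS tools work again
medhhinc_analysis = gpd.GeoDataFrame(
    medhhinc_analysis, geometry="geometry", crs=philly_tracts.crs
)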
1.3 Load the restaurants data¶
The Yelp dataset includes data for 7,350 restaurants across the city. Load the data from the data/
folder and use the latitude
and longitude
columns to create a GeoDataFrame after loading the JSON data. Be sure to set the right CRS on when initializing the GeoDataFrame!
Notes
The JSON data is in a "records" format. To load it, you'll need to pass the following keywords:
- orient='records'
- lines=True
import geopandas as gpd

# Load the JSON data ("records" format, one record per line)
restaurants = pd.read_json("data/restaurants_philly.json.gz", orient="records", lines=True)

# Create the GeoDataFrame; EPSG:4326 is the CRS for lat/lng coordinates
restaurants_gdf = gpd.GeoDataFrame(
    restaurants,
    geometry=gpd.points_from_xy(restaurants["longitude"], restaurants["latitude"]),
    crs="EPSG:4326",
)
restaurants_gdf.head()
|  | business_id | latitude | longitude | name | review_count | stars | categories | geometry |
|---|---|---|---|---|---|---|---|---|
| 0 | MTSW4McQd7CbVtyjqoe9mw | 39.955505 | -75.155564 | St Honore Pastries | 80 | 4.0 | Restaurants, Food, Bubble Tea, Coffee & Tea, B... | POINT (-75.15556 39.95551) |
| 1 | MUTTqe8uqyMdBl186RmNeA | 39.953949 | -75.143226 | Tuna Bar | 245 | 4.0 | Sushi Bars, Restaurants, Japanese | POINT (-75.14323 39.95395) |
| 2 | ROeacJQwBeh05Rqg7F6TCg | 39.943223 | -75.162568 | BAP | 205 | 4.5 | Korean, Restaurants | POINT (-75.16257 39.94322) |
| 3 | QdN72BWoyFypdGJhhI5r7g | 39.939825 | -75.157447 | Bar One | 65 | 4.0 | Cocktail Bars, Bars, Italian, Nightlife, Resta... | POINT (-75.15745 39.93982) |
| 4 | Mjboz24M9NlBeiOJKLEd_Q | 40.022466 | -75.218314 | DeSandro on Main | 41 | 3.0 | Pizza, Restaurants, Salad, Soup | POINT (-75.21831 40.02247) |
1.4 Add tract info for each restaurant¶
Do a spatial join to identify which census tract each restaurant is within. Make sure each dataframe has the same CRS!
At the end of this step, you should have a new dataframe with a column identifying the tract number for each restaurant.
# Reproject the tracts to match the restaurants' CRS before joining
restaurants_tract = gpd.sjoin(
    restaurants_gdf,
    philly_tracts[["geometry", "TRACTCE"]].to_crs(restaurants_gdf.crs),
    predicate="within",
    how="left",
).drop(columns=["index_right"])
restaurants_tract.head()
|  | business_id | latitude | longitude | name | review_count | stars | categories | geometry | TRACTCE |
|---|---|---|---|---|---|---|---|---|---|
| 0 | MTSW4McQd7CbVtyjqoe9mw | 39.955505 | -75.155564 | St Honore Pastries | 80 | 4.0 | Restaurants, Food, Bubble Tea, Coffee & Tea, B... | POINT (-75.15556 39.95551) | 000200 |
| 1 | MUTTqe8uqyMdBl186RmNeA | 39.953949 | -75.143226 | Tuna Bar | 245 | 4.0 | Sushi Bars, Restaurants, Japanese | POINT (-75.14323 39.95395) | 000102 |
| 2 | ROeacJQwBeh05Rqg7F6TCg | 39.943223 | -75.162568 | BAP | 205 | 4.5 | Korean, Restaurants | POINT (-75.16257 39.94322) | 001500 |
| 3 | QdN72BWoyFypdGJhhI5r7g | 39.939825 | -75.157447 | Bar One | 65 | 4.0 | Cocktail Bars, Bars, Italian, Nightlife, Resta... | POINT (-75.15745 39.93982) | 001800 |
| 4 | Mjboz24M9NlBeiOJKLEd_Q | 40.022466 | -75.218314 | DeSandro on Main | 41 | 3.0 | Pizza, Restaurants, Salad, Soup | POINT (-75.21831 40.02247) | 021000 |
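As a quick sanity check: since the join used how='left', any restaurant that fell outside every tract boundary will have a missing TRACTCE. A short sketch to count them:

# Restaurants that didn't match any census tract (should be few, if any)
print(restaurants_tract["TRACTCE"].isna().sum())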
1.5 Add income data to your restaurant data¶
Add the income data to your dataframe from the previous step, merging the census data based on the tract that each restaurant is within.
# Merge the income data onto each restaurant via its census tract
restaurants_income = pd.merge(
    restaurants_tract,
    medhhinc_analysis[["medhhinc", "tract"]],
    left_on="TRACTCE",
    right_on="tract",
)
restaurants_income.head()
|  | business_id | latitude | longitude | name | review_count | stars | categories | geometry | TRACTCE | medhhinc | tract |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | MTSW4McQd7CbVtyjqoe9mw | 39.955505 | -75.155564 | St Honore Pastries | 80 | 4.0 | Restaurants, Food, Bubble Tea, Coffee & Tea, B... | POINT (-75.15556 39.95551) | 000200 | 91067 | 000200 |
| 1 | L_sXNadtVHjxMw7Yhvkj9Q | 39.955454 | -75.154900 | Naoki Ramen | 92 | 4.0 | Ramen, Restaurants, Japanese | POINT (-75.15490 39.95545) | 000200 | 91067 | 000200 |
| 2 | icp_IKE9zIkAqAucyS1vTA | 39.955495 | -75.155256 | Hakka Beef House | 33 | 4.5 | Restaurants, Chinese | POINT (-75.15526 39.95549) | 000200 | 91067 | 000200 |
| 3 | 8cLbpSZOHnmzxzDNSBGQCA | 39.955417 | -75.155495 | About BBQ | 14 | 4.0 | Barbeque, Restaurants | POINT (-75.15549 39.95542) | 000200 | 91067 | 000200 |
| 4 | mCo2uVTTGYrEhRrkQW-CMw | 39.953940 | -75.156309 | Empress Garden | 263 | 4.0 | Taiwanese, Chinese, Noodles, Restaurants | POINT (-75.15631 39.95394) | 000200 | 91067 | 000200 |
1.6 Make a plot of median household income vs. Yelp stars¶
Our dataset has the number of stars for each restaurant, rounded to the nearest 0.5 star. In this step, create a line plot that shows the average income value for each stars category (e.g., all restaurants with 1 star, 1.5 stars, 2 stars, etc.)
While there are multiple ways to do this, the seaborn.lineplot() function is a great option: it can show the average value in each category as well as a 95% uncertainty interval. Use this function to plot the stars ("x") vs. average income ("y") for all of our restaurants, using the dataframe from the last step. Be sure to format your figure to make it look nice!
Question: Is there a correlation between a restaurant's ratings and the income levels of its surrounding neighborhood?
import seaborn as sns
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))

# Pass the raw restaurant-level data so seaborn computes the mean income
# per stars category along with a 95% confidence interval
sns.lineplot(
    data=restaurants_income,
    x="stars",
    y="medhhinc",
    marker="o",
    errorbar=("ci", 95),
    color="#e78ac3",
)

# Format the plot
plt.suptitle("Philadelphia, 2021", fontsize=14)
plt.title("Average Income by Restaurant Star Rating", fontsize=16, fontweight="bold")
plt.xlabel("Star Rating", fontsize=14)
plt.ylabel("Average Income", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()

# Show the plot
plt.show()
Annotation: The plot reveals a positive correlation between restaurant ratings and household income for star ratings below 4.0. However, as the star rating increases from 4.0 to 5.0, household income decreases, indicating a negative relationship.
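To attach a number to the question above, one optional check (not required by the assignment) is the Pearson correlation between stars and tract income at the restaurant level:

# Correlation between star rating and surrounding median household income
print(restaurants_income["stars"].corr(restaurants_income["medhhinc"]))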
2. Fast food trends in Philadelphia¶
At the end of part 1, you should have seen a strong trend where higher income tracts generally had restaurants with better reviews. In this section, we'll explore the impact of fast food restaurants and how they might be impacting this trend.
Hypothesis
- Fast food restaurants are predominantly located in areas with lower median income levels.
- Fast food restaurants have worse reviews compared to typical restaurants.
If true, these two hypotheses could help to explain the trend we found in part 1. Let's dive in and test our hypotheses!
2.1 Identify fast food restaurants¶
The "categories" column in our dataset contains multiple classifications for each restaurant. One such category is "Fast Food". In this step, add a new column called "is_fast_food" that is True
if the "categories" column contains the term "Fast Food" and False
otherwise
# Flag restaurants whose categories include "Fast Food"
restaurants_income["is_fast_food"] = restaurants_income["categories"].str.contains(
    "Fast Food", case=False, na=False
)
restaurants_income.head()
|  | business_id | latitude | longitude | name | review_count | stars | categories | geometry | TRACTCE | medhhinc | tract | is_fast_food |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | MTSW4McQd7CbVtyjqoe9mw | 39.955505 | -75.155564 | St Honore Pastries | 80 | 4.0 | Restaurants, Food, Bubble Tea, Coffee & Tea, B... | POINT (-75.15556 39.95551) | 000200 | 91067 | 000200 | False |
| 1 | L_sXNadtVHjxMw7Yhvkj9Q | 39.955454 | -75.154900 | Naoki Ramen | 92 | 4.0 | Ramen, Restaurants, Japanese | POINT (-75.15490 39.95545) | 000200 | 91067 | 000200 | False |
| 2 | icp_IKE9zIkAqAucyS1vTA | 39.955495 | -75.155256 | Hakka Beef House | 33 | 4.5 | Restaurants, Chinese | POINT (-75.15526 39.95549) | 000200 | 91067 | 000200 | False |
| 3 | 8cLbpSZOHnmzxzDNSBGQCA | 39.955417 | -75.155495 | About BBQ | 14 | 4.0 | Barbeque, Restaurants | POINT (-75.15549 39.95542) | 000200 | 91067 | 000200 | False |
| 4 | mCo2uVTTGYrEhRrkQW-CMw | 39.953940 | -75.156309 | Empress Garden | 263 | 4.0 | Taiwanese, Chinese, Noodles, Restaurants | POINT (-75.15631 39.95394) | 000200 | 91067 | 000200 | False |
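Before comparing income levels, it's worth checking how many restaurants were flagged; a quick sketch:

# Count fast food vs. non-fast-food restaurants
print(restaurants_income["is_fast_food"].value_counts())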
2.2 Calculate the median income for fast food and otherwise¶
Group by the "is_fast_food" column and calculate the median income for restaurants that are and are not fast food. You should find that income levels are lower in tracts with fast food.
Note: this is just an estimate, since we are calculating a median of median income values.
# Median of the tract-level median incomes, split by fast food status
restaurants_income.groupby("is_fast_food")["medhhinc"].median()
2.3 Load fast food review data¶
In the rest of part 2, we're going to run a sentiment analysis on the reviews for fast food restaurants.
The review data for all fast food restaurants identified in part 2.1 is already stored in the data/ folder. The data is stored as a JSON file, and you can use pandas.read_json to load it.
Notes
The JSON data is in a "records" format. To load it, you'll need to pass the following keywords:
- orient='records'
- lines=True
reviews = pd.read_json("data/reviews_philly_fast_food.json.gz", orient='records', lines=True)
reviews.head()
|  | business_id | review_id | stars | text |
|---|---|---|---|---|
| 0 | kgMEBZG6rjkGeFzPaIM4MQ | E-yGr1OhsUBxNeUVLDVouA | 1 | I know I shouldn't expect much but everything ... |
| 1 | FKrP06TDAKtxNG1vrRQcQQ | 0IpFZoaY_RRNjha8Q_Wz6w | 2 | Perfect place to go if you like waiting 20 mi... |
| 2 | w9hS5x1F52Id-G1KTrAOZg | 0KlwfaHZyvao41_3S47dyg | 2 | Was not a fan of their cheesesteak. Their wiz ... |
| 3 | fr2qDm_mY1afIGMvqsKUCg | oKSUOq7pCQzyypFDSa1HoA | 3 | Ok this is an aberration from my city foodie r... |
| 4 | fr2qDm_mY1afIGMvqsKUCg | 6SMUmb7Npwnq6AusxqOXzQ | 5 | My family has been customers of George's for y... |
2.4 Trim to the most popular fast food restaurants¶
There are too many reviews to run a sentiment analysis on all of them in a reasonable amount of time. Let's trim our reviews dataset to the most popular fast food restaurants, using the list provided below.
You will need to get the "business_id" values for each of these restaurants from the restaurants data loaded in part 1.3. Then, trim the reviews data to include reviews only for those business IDs.
popular_fast_food = [
"McDonald's",
"Wendy's",
"Subway",
"Popeyes Louisiana Kitchen",
"Taco Bell",
"KFC",
"Burger King",
]
popular_restaurants = restaurants_gdf[restaurants_gdf["name"].isin(popular_fast_food)]
popular_reviews = reviews[reviews["business_id"].isin(popular_restaurants["business_id"])]
popular_reviews.head()
|  | business_id | review_id | stars | text |
|---|---|---|---|---|
| 0 | kgMEBZG6rjkGeFzPaIM4MQ | E-yGr1OhsUBxNeUVLDVouA | 1 | I know I shouldn't expect much but everything ... |
| 8 | PjknD8uD_0tisZQbomiYoQ | 6TqKBa-HDiq2_W_ip2AItA | 5 | I am only giving 5 stars because the Shamrock ... |
| 13 | kgMEBZG6rjkGeFzPaIM4MQ | NGaXI03qbtBLshjfJV4pbQ | 3 | Dirty bathrooms and very slow service, but I w... |
| 17 | LACylKxImNI29DKUQpWuHw | HHy9yIjW07VHUE6nXVbsVA | 3 | Burger King is an okay alternative to Mcdonald... |
| 21 | gq4zw-ru_rkZ2UBIanaZFQ | yMZTK5B_0SAdUXSrIkXrmA | 1 | ive tried going here four times with no succes... |
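Since the sentiment model is the slow step, a quick check of how much the trim helped (a sketch using the dataframes above):

# Compare review counts before and after trimming to the popular chains
print(f"All fast food reviews: {len(reviews):,}")
print(f"Popular-chain reviews: {len(popular_reviews):,}")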
2.5 Run the emotions classifier on fast food reviews¶
Run a sentiment analysis on the reviews data from the previous step. Use the DistilBERT model that can predict emotion labels (anger, fear, sadness, joy, love, and surprise). Transform the result from the classifier into a DataFrame so that you have a column for each of the emotion labels.
from transformers import pipeline
sentiment_analysis = popular_reviews.dropna(subset=["text"])
sentiment_analysis_final = sentiment_analysis.loc[sentiment_analysis["text"] != ""]
# Strip out spaces and convert to a list
text = sentiment_analysis_final["text"].str.strip().tolist()
# The model
model = "bhadresh-savani/distilbert-base-uncased-emotion"
# Initialize our sentiment analyzer
emotion_classifier = pipeline(
task="text-classification", # The task we are doing
model=model, # The specific model name
top_k=None, # Predict all labels, not just top ones
tokenizer=model, # Tokenize inputs using model tokenizer
truncation=True, # Truncate text if we need to
)
# Run the classifier on all reviews; this can take a few minutes
# (the pipeline call also accepts a batch_size keyword to speed it up)
emotion_scores = emotion_classifier(text)
# Convert the list of [{label, score}, ...] results into a DataFrame
# with one column per emotion, and attach the original text
emotion = pd.DataFrame(
    [{d["label"]: d["score"] for d in dd} for dd in emotion_scores]
).assign(text=text)
emotion.head()
|  | sadness | fear | anger | joy | surprise | love | text |
|---|---|---|---|---|---|---|---|
| 0 | 0.733869 | 0.250677 | 0.011039 | 0.002758 | 0.001015 | 0.000643 | I know I shouldn't expect much but everything ... |
| 1 | 0.000230 | 0.000126 | 0.000165 | 0.998759 | 0.000246 | 0.000475 | I am only giving 5 stars because the Shamrock ... |
| 2 | 0.000216 | 0.000088 | 0.000153 | 0.998563 | 0.000161 | 0.000819 | Dirty bathrooms and very slow service, but I w... |
| 3 | 0.000838 | 0.000403 | 0.000811 | 0.996928 | 0.000140 | 0.000880 | Burger King is an okay alternative to Mcdonald... |
| 4 | 0.005284 | 0.000753 | 0.006195 | 0.985421 | 0.001620 | 0.000726 | ive tried going here four times with no succes... |
2.6 Identify the predicted emotion for each review¶
Use the emotion scores from the previous step to add a new column called "prediction" that holds the emotion label with the highest score for each review.
emotion_labels = ["anger", "fear", "sadness", "joy", "surprise", "love"]
# Work on a copy so the original scores DataFrame stays untouched
emotion_pred = emotion.copy()
# The predicted emotion is the label with the highest score
emotion_pred["prediction"] = emotion_pred[emotion_labels].idxmax(axis=1)
emotion_pred.head()
|  | sadness | fear | anger | joy | surprise | love | text | prediction |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.733869 | 0.250677 | 0.011039 | 0.002758 | 0.001015 | 0.000643 | I know I shouldn't expect much but everything ... | sadness |
| 1 | 0.000230 | 0.000126 | 0.000165 | 0.998759 | 0.000246 | 0.000475 | I am only giving 5 stars because the Shamrock ... | joy |
| 2 | 0.000216 | 0.000088 | 0.000153 | 0.998563 | 0.000161 | 0.000819 | Dirty bathrooms and very slow service, but I w... | joy |
| 3 | 0.000838 | 0.000403 | 0.000811 | 0.996928 | 0.000140 | 0.000880 | Burger King is an okay alternative to Mcdonald... | joy |
| 4 | 0.005284 | 0.000753 | 0.006195 | 0.985421 | 0.001620 | 0.000726 | ive tried going here four times with no succes... | joy |
2.7 Combine the ratings and sentiment data¶
Combine the data from part 2.4 (reviews data) and part 2.6 (emotion data). Use the pd.concat() function and combine along the column axis.
Note: You'll need to reset the index of your reviews data frame so it matches the emotion data index (it should run from 0 to the length of the data - 1).
# Keep only the prediction and its score; the review text already lives
# in the reviews DataFrame, so we avoid duplicating the "text" column
emotion_tidy = emotion_pred.copy()
emotion_tidy["score"] = emotion_tidy[emotion_labels].max(axis=1)
emotion_tidy = emotion_tidy[["prediction", "score"]]

# Reset the index so it matches the emotion data (0 to N-1)
popular_reviews_reset = popular_reviews.reset_index(drop=True)

# Combine along the column axis and rename for the plot legend
emotion_restaurants = pd.concat([popular_reviews_reset, emotion_tidy], axis=1)
emotion_restaurants = emotion_restaurants.rename(columns={"prediction": "Emotion"})
emotion_restaurants.head()
|  | business_id | review_id | stars | text | Emotion | score |
|---|---|---|---|---|---|---|
| 0 | kgMEBZG6rjkGeFzPaIM4MQ | E-yGr1OhsUBxNeUVLDVouA | 1 | I know I shouldn't expect much but everything ... | sadness | 0.733869 |
| 1 | PjknD8uD_0tisZQbomiYoQ | 6TqKBa-HDiq2_W_ip2AItA | 5 | I am only giving 5 stars because the Shamrock ... | joy | 0.998759 |
| 2 | kgMEBZG6rjkGeFzPaIM4MQ | NGaXI03qbtBLshjfJV4pbQ | 3 | Dirty bathrooms and very slow service, but I w... | joy | 0.998563 |
| 3 | LACylKxImNI29DKUQpWuHw | HHy9yIjW07VHUE6nXVbsVA | 3 | Burger King is an okay alternative to Mcdonald... | joy | 0.996928 |
| 4 | gq4zw-ru_rkZ2UBIanaZFQ | yMZTK5B_0SAdUXSrIkXrmA | 1 | ive tried going here four times with no succes... | joy | 0.985421 |
2.8 Plot sentiment vs. stars¶
We now have a dataframe with the predicted primary emotion for each review and the associated number of stars for each review. Let's explore two questions:
- Does sentiment analysis work? Do reviews with fewer stars have negative emotions?
- For our fast food restaurants, are reviews generally positive or negative?
Use seaborn's histplot() to make a stacked bar chart showing the breakdown of each emotion for each stars category (1 star, 2 stars, etc.). A few notes:
- To stack multiple emotion labels in one bar, use the multiple="stack" keyword
- The discrete=True keyword can be helpful to tell seaborn that our stars values are discrete categories
# Define the custom palette
custom_palette = ['#fbb4ae','#b3cde3','#ccebc5','#decbe4','#fed9a6','#ffffcc']
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=emotion_restaurants,
    x="stars",
    hue="Emotion",
    multiple="stack",  # Stack the emotion labels in each bar
    discrete=True,     # Treat the stars values as discrete categories
    shrink=0.8,        # Adjust width of bars (optional)
    palette=custom_palette,
    edgecolor="white",
    ax=ax,
)
# Title and labels
plt.title("Emotion Breakdown by Star Rating", fontsize=16, fontweight='bold')
plt.suptitle("(Most popular fast food restaurants in Philadelphia)", fontsize=14)
plt.xlabel("Star Rating", fontsize=14)
plt.ylabel("Count of Reviews", fontsize=14)
# Display the plot
plt.tight_layout()
plt.show()
Question: What does your chart indicate for the effectiveness of our sentiment analysis? Does our original hypothesis about fast food restaurants seem plausible?
Annotation: The distribution of emotions closely tracks the star ratings. One- and two-star reviews are dominated by sadness and anger, while joy becomes more prevalent as ratings increase, suggesting the sentiment analysis is working as intended. Among the most popular fast food restaurants in Philadelphia, one-star reviews substantially outnumber those with higher ratings, indicating that reviews of these chains skew negative, consistent with our second hypothesis.
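To back this reading up with numbers, a minimal sketch of a normalized crosstab (each row shows the share of emotions within one stars category):

# Share of each predicted emotion within each stars category (rows sum to 1)
print(
    pd.crosstab(
        emotion_restaurants["stars"], emotion_restaurants["Emotion"], normalize="index"
    ).round(2)
)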