This project explores the Airbnb short-term rental market in New York City using data from 2019-2020. It leverages Exploratory Data Analysis (EDA), statistical modeling, and advanced regression techniques to provide insights into pricing strategies, neighborhood trends, and room type preferences.
Key Variables of Interest:
Primary Objectives:
ggplot2
, glmnet
, caret
, table1
, and more
. ├── Data/ │ ├── airbnb_nyc_2019.csv ├── Analysis/ │ ├── scripts/ │ │ ├── eda.R │ │ ├── regression.R │ │ └── visualization.R ├── Visualizations/ │ ├── tableau_dashboard.twb ├── Reports/ │ ├── airbnb_analysis_report.pdf ├── README.md
The Airbnb Analysis Report includes:
Feel free to fork the repository and suggest improvements or enhancements. For questions or collaborations, please reach out via LinkedIn.
Author: Syed Faizan
Master’s Student in Data Analytics and Machine Learning
Northeastern University, Canada
#---------------------------------------------------------# | |
# Airbnb: Analysis of Short-Term Rentals by Syed Faizan # | |
# # | |
# # | |
# # | |
# # | |
# # | |
# # | |
# # | |
# # | |
#---------------------------------------------------------# | |
#Starting with a clean environment---- | |
rm(list=ls()) | |
# Clearing the Console | |
cat("\014") # Clears the console | |
# Clearing scientific notation | |
options(scipen = 999) | |
#Loading the packages utilized for Data cleaning and Data Analysis----- | |
library(tidyverse) | |
library(grid) | |
library(gridExtra) | |
library(dplyr) | |
library(kableExtra) | |
library(ggplot2) | |
library(caret) | |
library(rms) | |
library(DataExplorer) | |
library(dlookr) | |
library(lubridate) | |
library(MASS) | |
library(pROC) | |
library(dplyr) | |
library(table1) | |
# Loading the Data set | |
abnb <- read.csv("airbnb_nyc.csv") | |
# Overview of the Dataset | |
summary(abnb) | |
names(abnb) | |
#View(abnb) | |
abnb <- data.frame(abnb) | |
# Checking for Missing values---- | |
plot_missing(abnb) | |
sum(is.na(abnb)) # 10,052 values missing from the column 'reviews_per_month'. None missing in other columns. | |
# Decision to remove this column along with 'id', 'host_id', 'host_name', 'name' from the Dataset | |
abnb <- abnb %>% | |
dplyr::select( - id, - host_id, - host_name, - name, - reviews_per_month) | |
# Descriptive statistics---- | |
library(psych) | |
psych::describe(abnb) %>% | |
kable() | |
library(vtable) | |
st(abnb) | |
table1(abnb, labels = (table)) | |
?table1() | |
descriptive_table <- abnb %>% | |
diagnose_numeric() | |
descriptive_table | |
# Basic Visualizations prior to Exploratory Data Analysis---- | |
# Normality plots | |
plot_normality(abnb) | |
# Outlier Plot - Note that there are no outliers in 'availability' | |
plot_outlier(abnb[ , c(6:8, 10:11)]) | |
# Note that since 'price' is highly skewed we log transform it prior to visualization. | |
# Plot of log transformed 'Price' by neighborhood | |
abnb %>% | |
ggplot(aes(price, fill = neighbourhood_group)) + | |
geom_histogram(position = "identity", alpha = 0.5, bins = 20) + | |
scale_x_log10(labels = scales::dollar_format()) + | |
labs(fill = NULL, x = "price") | |
# table of price by neighborhood | |
price_by_neighborhood <- abnb %>% | |
group_by(neighbourhood_group) %>% | |
summarise(mean_price = mean(price)) | |
kable(price_by_neighborhood) | |
# Room type by neighborhood visualized | |
ggplot(abnb, aes(neighbourhood_group)) + geom_bar(aes(fill = room_type)) + ggtitle("Room type by Neighborhood group") | |
library(dplyr) | |
# Count istings for each combination of neighbourhood_group and room_type | |
roomtype_by_neighborhood <- abnb %>% | |
group_by(neighbourhood_group, room_type) %>% | |
summarise(total_listings = n(), .groups = 'drop') | |
kable(roomtype_by_neighborhood) | |
# mean price by room type and neighbourhood | |
roomtype_by_neighborhood_meanprice <- abnb %>% | |
group_by(neighbourhood_group, room_type) %>% | |
summarise(mean_price = mean(price), .groups = 'drop') | |
# map by mean price log transformed | |
abnb %>% | |
ggplot(aes(longitude, latitude, z = log(price))) + | |
stat_summary_hex(alpha = 0.8, bins = 70) + | |
scale_fill_viridis_c() + | |
labs(fill = "Mean price (log)") | |
# plot of price by neighborhood as different plots | |
# Plot of log transformed 'Price' by neighborhood | |
abnb %>% | |
ggplot(aes(log(price))) + | |
geom_histogram(aes(y = ..density..), bins = 30, fill = 'purple') + | |
geom_density( alpha = 0.2, fill = 'purple') + | |
scale_x_log10(labels = scales::dollar_format()) + | |
facet_wrap(~ neighbourhood_group) + | |
labs(fill = NULL, x = "Log price by neighborhood") | |
# plot of above average price room types by neighbourood | |
# Calculate the average price once, outside the filter | |
average_price <- mean(abnb$price, na.rm = TRUE) | |
abnb %>% | |
filter(price >= average_price) %>% # Filter rows where price is above average | |
group_by(neighbourhood_group, room_type) %>% | |
tally() %>% | |
ggplot(aes(x = reorder(neighbourhood_group, n), y = n, fill = room_type)) + # Reorder based on count 'n' | |
geom_bar(stat = "identity") + # Specify bar chart | |
labs(title = "Above Average Price by Neighborhood and Room Type", | |
x = "Neighborhood Group", y = "Count of Listings", fill = "Room Type") | |
# Box plots of log price by room type | |
abnb %>% | |
ggplot(aes(x = room_type, y = price)) + | |
geom_boxplot(aes(fill = room_type)) + scale_y_log10() + | |
ylab("Price") + | |
xlab("Room Type") + | |
ggtitle("Boxplots of Price by room type") + | |
geom_hline(yintercept = mean(log(abnb$price)), linetype = 2, color = "purple") | |
# Box plots of log price by neighborhood group | |
abnb %>% | |
ggplot(aes(x = neighbourhood_group, y = price)) + | |
geom_boxplot(aes(fill = neighbourhood_group)) + scale_y_log10() + | |
ylab("Price") + | |
xlab("Neighborhood Group") + | |
ggtitle("Boxplots of Price by Neighbourhood Group") + | |
geom_hline(yintercept = mean(log(abnb$price)), linetype = 2, color = "purple") | |
# Pair plots for the numeric variables | |
# Create a new dataframe with only the numerical variables | |
#Add log transformed price and retain only the numerical variables in the subset abnb_n | |
abnb_n <- abnb %>% | |
mutate(log_price = log1p(price)) | |
abnb_n <- abnb_n %>% | |
select_if(is.numeric) | |
abnb_n <- abnb_n[ , c(4:8)] | |
names(abnb_n) | |
#Scatterplots of the numeric variables | |
library(ggplot2) | |
library(gridExtra) | |
df <- abnb_n | |
# Scatterplot of Log Transformed Price vs. Minimum Nights | |
ggplot(data = df, aes(x = log_price, y = minimum_nights)) + | |
geom_point(color = "red") + | |
labs(title = "Log Transformed Price vs. Minimum Nights", x = "Log Transformed Price", y = "Minimum Nights") | |
# Scatterplot of Log Transformed Price vs. Number of Reviews | |
ggplot(data = df, aes(x = log_price, y = number_of_reviews)) + | |
geom_point(color = "magenta") + | |
labs(title = "Log Transformed Price vs. Number of Reviews", x = "Log Transformed Price", y = "Number of Reviews") | |
# Scatterplot of Log Transformed Price vs. Calculated Host Listings Count | |
ggplot(data = df, aes(x = log_price, y = calculated_host_listings_count)) + | |
geom_point(color = "green") + | |
labs(title = "Log Transformed Price vs. Host Listings Count", x = "Log Transformed Price", y = "Host Listings Count") | |
# Scatterplot of Log Transformed Price vs. Availability 365 | |
ggplot(data = df, aes(x = log_price, y = availability_365)) + | |
geom_point(color = "pink") + | |
labs(title = "Log Transformed Price vs. Availability 365", x = "Log Transformed Price", y = "availability_365") | |
# Using the techniques practiced in ALY 6015 course as required by the assignment rubric | |
# We perform Chi-Square and ANOVA tests between the suitable variables---- | |
# Convert categorical variables to factors | |
abnb$neighbourhood_group <- as.factor(abnb$neighbourhood_group) | |
abnb$room_type <- as.factor(abnb$room_type) | |
# Chi-Square Test of Independence between neighbourhood_group and room_type | |
chi_test_result <- chisq.test(table(abnb$neighbourhood_group, abnb$room_type)) | |
chi_test_result | |
# Contingency table for chi-square test | |
table1(~ room_type | neighbourhood_group, data=abnb, topclass="Rtable1-zebra", caption = "<b>Contengency Table for Chi-square test</b>") | |
# ANOVA for Price by Neighbourhood Group | |
anova_price_ng <- aov(price ~ neighbourhood_group, data = abnb) | |
summary(anova_price_ng) | |
TukeyHSD(anova_price_ng) | |
# ANOVA for Price by Room Type | |
anova_price_rt <- aov(price ~ room_type, data = abnb) | |
summary(anova_price_rt) | |
TukeyHSD(anova_price_rt) | |
# ANOVA for Number of Reviews by Neighbourhood Group | |
anova_reviews_ng <- aov(number_of_reviews ~ neighbourhood_group, data = abnb) | |
summary(anova_reviews_ng) | |
TukeyHSD(anova_reviews_ng) | |
# Two way ANOVA for the interaction between Rooms and Neighbourhood | |
anova_two_way <- aov(price ~ neighbourhood_group*room_type, data = abnb) | |
summary(anova_two_way) | |
TukeyHSD(anova_two_way) | |
# correlation analysis | |
# Confirming the numerical variables data frame | |
names(abnb_n) | |
library(ggcorrplot) | |
cor_matrix <- cor(abnb_n) | |
ggcorrplot(cor_matrix, lab = TRUE) | |
# Linear regression model----- | |
names(abnb_n) | |
model_linear <- lm(log_price ~ . , data = abnb_n) | |
summary(model_linear) | |
plot(model_linear) | |
# Looking for Outliers in the model | |
library(car) | |
outlierTest(model_linear) | |
vif(model_linear) | |
# Logisitic Model to predict price by room type---- | |
# Load necessary libraries | |
library(caret) | |
library(pROC) | |
library(dplyr) | |
# Encoding room type into a binary categorical variable | |
abnb$room_binary <- ifelse(abnb$room_type == "Entire home/apt", "home", "room") | |
abnb$room_binary <- as.factor(abnb$room_binary) | |
# Encoding price into a binary categorical variable | |
threshold <- median(abnb$price) | |
abnb$price_binary <- ifelse(abnb$price > threshold, "high", "low") | |
abnb$price_binary <- as.factor(abnb$price_binary) | |
abnb$price_binary <- relevel(abnb$price_binary, ref = "low") | |
# Convert neighbourhood_group to factor | |
abnb$neighbourhood_group <- as.factor(abnb$neighbourhood_group) | |
# Set seed for reproducibility | |
set.seed(123) | |
# Splitting the data | |
trainIndex <- createDataPartition(abnb$price_binary, p = 0.7, list = FALSE, times = 1) | |
train_data <- abnb[trainIndex,] | |
test_data <- abnb[-trainIndex,] | |
# Fitting logistic regression model on training data | |
logistic_model_train <- glm(price_binary ~ room_binary + minimum_nights + neighbourhood_group + availability_365, | |
data = train_data, family = "binomial") | |
summary(logistic_model_train) | |
train_data %>% | |
group_by(price_binary) %>% | |
count() | |
# Predicting on the train data | |
predicted_probabilities_train <- predict(logistic_model_train, train_data, type = "response") | |
predicted_classes_train <- ifelse(predicted_probabilities_train > 0.5, "high", "low") | |
predicted_classes_train <- factor(predicted_classes_train, levels = c("low", "high")) | |
# Generate the confusion matrix for training data | |
conf_matrix_train <- confusionMatrix(predicted_classes_train, train_data$price_binary, positive = "high") | |
print(conf_matrix_train) | |
# Generating ROC curve for training data | |
roc_object_train <- roc(response = train_data$price_binary, | |
predictor = as.numeric(predicted_probabilities_train), | |
levels = c("low", "high")) | |
# Plotting ROC curve for training data | |
plot(roc_object_train, main = "ROC Curve - Training Data", col = "#1c61b6") | |
auc_value_train <- auc(roc_object_train) | |
print(paste("Area Under the Curve (AUC) - Training Data:", auc_value_train)) | |
text(x = 0.6, y = 0.2, labels = paste("AUC =", round(auc_value_train, 3)), col = "#ff0000", cex = 1.2) | |
# Predicting on the test data | |
predicted_probabilities_test <- predict(logistic_model_train, test_data, type = "response") | |
predicted_classes_test <- ifelse(predicted_probabilities_test > 0.5, "high", "low") | |
predicted_classes_test <- factor(predicted_classes_test, levels = c("low", "high")) | |
# Generate the confusion matrix for test data | |
conf_matrix_test <- confusionMatrix(predicted_classes_test, test_data$price_binary, positive = "high") | |
print(conf_matrix_test) | |
# Generating ROC curve for test data | |
roc_object_test <- roc(response = test_data$price_binary, | |
predictor = as.numeric(predicted_probabilities_test), | |
levels = c("low", "high")) | |
# Plotting ROC curve for test data | |
plot(roc_object_test, main = "ROC Curve - Test Data", col = "#ff0000") | |
auc_value_test <- auc(roc_object_test) | |
print(paste("Area Under the Curve (AUC) - Test Data:", auc_value_test)) | |
text(x = 0.6, y = 0.2, labels = paste("AUC =", round(auc_value_test, 3)), col = "#ff0000", cex = 1.2) | |
# Plotting ROC curves for comparison | |
plot(roc_object_train, main = "ROC Curve Comparison", col = "#1c61b6") | |
text(x = 0.6, y = 0.4, labels = paste("AUC Train =", round(auc_value_train, 3)), col = "#1c61b6", cex = 1.2) | |
lines(roc_object_test, col = "#ff0000") | |
text(x = 0.6, y = 0.3, labels = paste("AUC Test =", round(auc_value_test, 3)), col = "#ff0000", cex = 1.2) | |
legend("bottomright", legend = c("Train", "Test"), col = c("#1c61b6", "#ff0000"), lty = 1) | |
library(knitr) | |
# Create data frame | |
comparison_data <- data.frame( | |
Metric = c("Accuracy", "Kappa", "Sensitivity", "Specificity", "Positive Predictive Value", "Negative Predictive Value", "Prevalence", "Detection Rate", "Balanced Accuracy"), | |
Training = c(0.8208, 0.6416, 0.8339, 0.8077, 0.8123, 0.8297, 0.4995, 0.4165, 0.8208), | |
Testing = c(0.8199, 0.6399, 0.8310, 0.8089, 0.8127, 0.8275, 0.4995, 0.4151, 0.8199) | |
) | |
# Generate the table using kable | |
kable_table <- kable(comparison_data, | |
col.names = c("Metric", "Training Data", "Testing Data"), | |
caption = "Comparison of Metrics for Training and Testing Data", | |
align = c('l', 'c', 'c')) # Use "latex" for LaTeX output, "html" for HTML | |
# Display the table in the console or R Markdown | |
print(kable_table) | |
#Ridge and LASSO regression---- | |
abnb$room_binary <- as.factor(abnb$room_binary) #Factorizing the room binary variable | |
summary(abnb$room_binary) | |
class(abnb$room_binary) #confirming factor | |
abnb %>% | |
group_by(room_type) %>% #confirming the counts of different rooms | |
count() | |
# giving back room binary character entries of 'home' and 'room' to avoid confusion | |
abnb$room_binary <- as.factor(ifelse(abnb$room_type == "Entire home/apt", "home", "room")) | |
abnb$neighbourhood_group <- as.factor(abnb$neighbourhood_group) | |
unique(abnb$neighbourhood_group) | |
relevel(abnb$neighbourhood_group, ref = "Staten Island") | |
library(glmnet) | |
class(abnb$room_binary) | |
summary(abnb$room_binary) #checking in room binary is a factor | |
range(abnbm$price) | |
#creating a new subset for ridge and LASSO modelling | |
names(abnb) | |
abnbm <- abnb %>% | |
select(neighbourhood_group,room_binary,number_of_reviews,price,minimum_nights, availability_365,calculated_host_listings_count) | |
abnbm$neighbourhood_group <- as.factor(abnbm$neighbourhood_group) | |
unique(abnbm$neighbourhood_group) | |
relevel(abnbm$neighbourhood_group, ref = "Staten Island") | |
levels(abnbm$neighbourhood_group) | |
abnbm$room_binary <- as.factor(abnbm$room_binary) | |
unique(abnbm$room_binary) | |
relevel(abnbm$room_binary, ref = "room") | |
trainIndex <- createDataPartition(abnbm$room_binary, p = 0.70, list = FALSE, times = 1) | |
trainData <- abnbm[trainIndex, ] | |
testData <- abnbm[-trainIndex, ] | |
summary(trainData) | |
relevel(trainData$neighbourhood_group, ref = "Staten Island") | |
relevel(testData$neighbourhood_group, ref = "Staten Island") | |
# Preparing the model matrix of predictors and the vector of the response variable | |
train_x <- model.matrix(price ~ . -1, data = trainData) | |
train_y <- trainData$price | |
test_x <- model.matrix(price ~ . -1, data = testData) | |
test_y <- testData$price | |
# Ridge Regression | |
set.seed(314) | |
cv.ridge <- cv.glmnet(x = train_x, y = train_y, alpha = 0, standardize = TRUE) | |
bestlam_ridge <- cv.ridge$lambda.min | |
bestlam_1se_ridge <- cv.ridge$lambda.1se | |
bestlam_ridge | |
log(bestlam_ridge) | |
bestlam_1se_ridge | |
log(bestlam_1se_ridge) | |
plot(cv.ridge) | |
levels(abnbm$neighbourhood_group) | |
ridge.mod <- glmnet(x = train_x, y = train_y, alpha = 0, lambda = bestlam_ridge) | |
coef.ridge <- coef(ridge.mod) | |
ridge.mod | |
coef.ridge | |
plot(glmnet(x = train_x, y = train_y, alpha = 0), xvar = "lambda", label = TRUE) | |
abline(v = log(c(bestlam_ridge, bestlam_1se_ridge)), col = c("green", "purple")) | |
plot(glmnet(x = train_x, y = train_y, alpha = 0), xvar = "dev", label = TRUE) | |
abline(v = log(c(bestlam_ridge, bestlam_1se_ridge)), col = c("green", "purple")) | |
preds_ridge_train <- predict(ridge.mod, newx = train_x) | |
preds_ridge_test <- predict(ridge.mod, newx = test_x) | |
rmse_train_ridge <- sqrt(mean((train_y - preds_ridge_train)^2)) | |
rmse_test_ridge <- sqrt(mean((test_y - preds_ridge_test)^2)) | |
rmse_test_ridge | |
rmse_train_ridge | |
# LASSO | |
set.seed(324) | |
cv.lasso <- cv.glmnet(x = train_x, y = train_y, alpha = 1) | |
bestlam_lasso <- cv.lasso$lambda.min | |
bestlam_1se_lasso <- cv.lasso$lambda.1se | |
plot(cv.lasso) | |
lasso.mod <- glmnet(x = train_x, y = train_y, alpha = 1, lambda = 2.71) | |
coef.lasso <- coef(lasso.mod) | |
coef.lasso | |
plot(glmnet(x = train_x, y = train_y, alpha = 1), xvar = "lambda", label = TRUE) | |
abline(v = log(c(bestlam_lasso, bestlam_1se_lasso)), col = c("blue", "red")) | |
plot(glmnet(x = train_x, y = train_y, alpha = 1), xvar = "dev", label = TRUE) | |
abline(v = log(c(bestlam_lasso, bestlam_1se_lasso)), col = c("blue", "red")) | |
preds_lasso_train <- predict(lasso.mod, newx = train_x) | |
preds_lasso_test <- predict(lasso.mod, newx = test_x) | |
rmse_train_lasso <- sqrt(mean((train_y - preds_lasso_train)^2)) | |
rmse_test_lasso <- sqrt(mean((test_y - preds_lasso_test)^2)) | |
# Outputting RMSE results for comparison | |
cat("Training RMSE Ridge: ", rmse_train_ridge, "") | |
cat("Test RMSE Ridge: ", rmse_test_ridge, "") | |
cat("Training RMSE LASSO: ", rmse_train_lasso, "") | |
cat("Test RMSE LASSO: ", rmse_test_lasso, "") | |
summary(abnbm$room_binary) | |
coef(ridge.mod) | |
summary(ridge.mod) | |
coef(lasso.mod) | |
library(knitr) | |
# Constructing the data frame with RMSE results | |
results_df <- data.frame( | |
Model = c("Ridge", "LASSO"), | |
Training_RMSE = c(rmse_train_ridge, rmse_train_lasso), | |
Testing_RMSE = c(rmse_test_ridge, rmse_test_lasso) | |
) | |
# Using kable to create a formatted table | |
kable_table <- kable(results_df, | |
col.names = c("Model", "Training RMSE", "Testing RMSE"), | |
caption = "Comparison of RMSE Values for Ridge and LASSO Regression Models", | |
align = c('l', 'c', 'c')) # Use "latex" for LaTeX output, "html" for HTML | |
# To display the table in an R Markdown document or similar environment | |
print(kable_table) | |
# Appendix - additional tables with table1 package | |
library(table1) | |
names(abnb) | |
render.median.IQR <- function(x, ...) { | |
if (is.numeric(x)) { | |
# Calculate statistics only if x is numeric | |
c('', | |
`Mean (SD)` = sprintf("%s (%s)", round(mean(x, na.rm = TRUE), 2), round(sd(x, na.rm = TRUE), 2)), | |
`Median [IQR]` = sprintf("%s [%s, %s]", median(x, na.rm = TRUE), | |
quantile(x, 0.25, na.rm = TRUE), quantile(x, 0.75, na.rm = TRUE))) | |
} else { | |
# Count frequencies of each category for non-numeric data | |
levels_counts = table(x) | |
counts = sapply(names(levels_counts), function(lvl) sprintf("%s: %d", lvl, levels_counts[lvl])) | |
c('', `Counts` = paste(counts, collapse = ", ")) | |
} | |
} | |
table1(~ neighbourhood_group + price + number_of_reviews + minimum_nights + availability_365 + calculated_host_listings_count|room_type , data=abnb, topclass="Rtable1-zebra") | |
table1(~ + price + number_of_reviews + minimum_nights + availability_365 + calculated_host_listings_count + room_type| neighbourhood_group, data=abnb, topclass="Rtable1-zebra", render = render.median.IQR) |
© 2025 Syed Faizan. All Rights Reserved.