This project applies Support Vector Machines (SVMs) to the Heart Disease dataset to classify patients into two categories: “No Disease” and “Disease.” The goal is to develop a robust predictive model that can assist healthcare professionals in identifying heart disease cases from clinical and demographic variables.
For a detailed walkthrough of the analysis, refer to the complete project report:
📄 Heart Disease Diagnosis Report
.
├── Data/
│   └── heart.csv
├── Scripts/
│   └── Heart_Disease_Diagnosis.R
├── Reports/
│   └── Heart_Disease_Diagnosis.pdf
└── README.md
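To reproduce the analysis, the script can be sourced from an R session; a minimal sketch, assuming the layout above and noting that the script reads heart.csv from its working directory, so the file under Data/ needs to be visible there:

# From an R session started at the repository root
file.copy("Data/heart.csv", ".")             # the script expects heart.csv in the working directory
source("Scripts/Heart_Disease_Diagnosis.R")  # runs the full analysis end to end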
Feel free to reach out for feedback, questions, or collaboration opportunities:
LinkedIn: Dr. Syed Faizan
Author: Syed Faizan
Master’s Student in Data Analytics and Machine Learning
#---------------------------------------------------------#
#                       Syed Faizan                        #
#            Heart Disease Diagnosis using SVMs            #
#---------------------------------------------------------#
# Starting with a clean environment ----
rm(list = ls())

# Clearing the console
cat("\014") # Clears the console

# Clearing scientific notation
options(scipen = 999)

# Loading the packages utilized for data cleaning and data analysis ----
library(tidyverse)
library(grid)
library(gridExtra)
library(dplyr)
library(kableExtra)
library(ggplot2)
library(DataExplorer)
library(dlookr)
library(lubridate)
library(table1)
library(psych)

# Loading the initial data set
heartdf <- read.csv('heart.csv')

str(heartdf) # confirming the structure of the data set
summary(heartdf)
head(heartdf)
tail(heartdf)

# Analyzing and visualizing the missing data
any(is.na(heartdf))
sum(is.na(heartdf))
plot_missing(heartdf)
# Feature Engineering
# Data transformation of target variable to factor
heartdf$target <- factor(heartdf$target, labels = c("NoDisease", "Disease"))

# Defining the label for each variable for clarity in the table
label(heartdf$age)      <- "Age"
label(heartdf$sex)      <- "Sex"
label(heartdf$cp)       <- "Chest Pain Type (CP)"
label(heartdf$trestbps) <- "Resting Blood Pressure (trestbps)"
label(heartdf$chol)     <- "Cholesterol (chol)"
label(heartdf$fbs)      <- "Fasting Blood Sugar (fbs)"
label(heartdf$restecg)  <- "Resting ECG (restecg)"
label(heartdf$thalach)  <- "Maximum Heart Rate Achieved (thalach)"
label(heartdf$exang)    <- "Exercise-Induced Angina (exang)"
label(heartdf$oldpeak)  <- "ST Depression (oldpeak)"
label(heartdf$slope)    <- "Slope of ST Segment (slope)"
label(heartdf$ca)       <- "Number of Major Vessels (ca)"
label(heartdf$thal)     <- "Thalassemia (thal)"

# Convert 'sex' to a factor with appropriate labels
heartdf$sex <- factor(heartdf$sex, levels = c(0, 1), labels = c("Female", "Male"))

# Convert 'cp' (chest pain type) to a factor with appropriate labels
heartdf$cp <- factor(heartdf$cp, levels = c(0, 1, 2, 3),
                     labels = c("Typical Angina", "Atypical Angina", "Non-Anginal Pain", "Asymptomatic"))

# Convert 'fbs' (fasting blood sugar > 120 mg/dl) to a factor with appropriate labels
heartdf$fbs <- factor(heartdf$fbs, levels = c(0, 1), labels = c("False", "True"))

# Convert 'restecg' to a factor with appropriate labels
heartdf$restecg <- factor(heartdf$restecg, levels = c(0, 1, 2),
                          labels = c("Normal", "ST-T Wave Abnormality",
                                     "Left Ventricular Hypertrophy"))

# Convert 'exang' (exercise-induced angina) to a factor with appropriate labels
heartdf$exang <- factor(heartdf$exang, levels = c(0, 1), labels = c("No", "Yes"))

# Convert 'slope' to a factor with appropriate labels
heartdf$slope <- factor(heartdf$slope, levels = c(0, 1, 2),
                        labels = c("Upsloping", "Flat", "Downsloping"))

# Convert 'thal' to a factor with appropriate labels
heartdf$thal <- factor(heartdf$thal, levels = c(0, 1, 2, 3),
                       labels = c("Unknown", "Normal", "Fixed Defect", "Reversible Defect"))

# Create table1 stratified by the target variable
# Load necessary library
library(table1)

# Categorical variables
table1(~ sex + cp + fbs + restecg + exang + slope + thal | target,
       data = heartdf)

# Numerical variables
table1(~ age + trestbps + chol + thalach + oldpeak + ca | target,
       data = heartdf)
# Data Visualization
# Create a data frame for categorical variables
cat_df <- heartdf[, c("sex", "cp", "fbs", "restecg", "exang", "slope", "thal", "target")]

# Create a data frame for numerical variables
num_df <- heartdf[, c("age", "trestbps", "chol", "thalach", "oldpeak", "ca")]

# Check the structure of the new data frames
str(cat_df)
str(num_df)
# Checking whether the numerical variables need scaling
means   <- sapply(num_df, mean)  # num_df holds only the six numeric predictors, so no columns need excluding
stddevs <- sapply(num_df, sd)
scaling_check <- cbind(means, stddevs)
print(scaling_check)
# The predictors differ widely in scale, and SVMs are sensitive to feature scale,
# so standardization is needed; it is applied after the visualizations below.
library(dlookr)
library(DataExplorer)
plot_outlier(num_df, col = "red")
plot_normality(num_df, col = "yellow")
plot_bar_category(cat_df)

# Correlation Matrix
library(ggcorrplot)
ggcorrplot(cor(num_df), lab = TRUE)

# Pair Plots
library(GGally)
ggpairs(num_df)
# Scaling
# num_df contains only the six numeric predictors; the 'target' factor is reattached after scaling
num_df_scaled <- scale(num_df, center = TRUE, scale = TRUE)

# Rebuild num_df from the scaled predictors
num_df <- data.frame(num_df_scaled)

# Reattach 'target' as a factor with levels "NoDisease" and "Disease"
num_df$target <- factor(heartdf$target)
# Divide into test and train
# Set seed for reproducibility
set.seed(314)

# Split the data into 70% training and 30% testing
train_index <- sample(1:nrow(num_df_scaled), size = 0.7 * nrow(num_df_scaled))

# Create training and testing datasets
train_data <- num_df[train_index, ]
test_data  <- num_df[-train_index, ]

# Check dimensions of the train and test sets
dim(train_data) # Should be 70% of the total rows
dim(test_data)  # Should be 30% of the total rows
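# Optional sanity check (a minimal sketch, not required for the analysis): the
# split above is a simple random sample, so it is worth confirming that both
# classes appear in similar proportions in each partition before fitting the SVMs.
table(train_data$target)
prop.table(table(train_data$target))
table(test_data$target)
prop.table(table(test_data$target))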
# SVM Implementation
library(e1071)
library(caret)

# Train an SVM model with linear kernel on the training data
svmfit <- svm(target ~ ., data = train_data, kernel = "linear", cost = 10, scale = FALSE)
summary(svmfit)

# Predictions on test data
ypred <- predict(svmfit, newdata = test_data)

# Confusion matrix using confusionMatrix from caret
conf_matrix <- confusionMatrix(ypred, test_data$target)
print(conf_matrix)
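# The confusionMatrix object from caret also exposes the headline metrics
# directly; a minimal sketch of pulling them out for quick comparison:
conf_matrix$overall["Accuracy"]
conf_matrix$byClass[c("Sensitivity", "Specificity")]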
# Tune SVM model to find best cost parameter
set.seed(1)
tune.out <- tune(svm, target ~ ., data = train_data, kernel = "linear",
                 ranges = list(cost = c(0.001, 0.01, 0.1, 1, 10, 100)))
summary(tune.out)
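# The tune() object from e1071 stores the cross-validation results; the chosen
# cost and its CV error can also be read off directly:
tune.out$best.parameters
tune.out$best.performance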
# Use the best model found through tuning
bestmod <- tune.out$best.model
summary(bestmod)

# Predictions using the best model on test data
ypred_best <- predict(bestmod, newdata = test_data)

# Confusion matrix for the best tuned model
conf_matrix_best <- confusionMatrix(ypred_best, test_data$target)
print(conf_matrix_best)

# Support Vector Machine with Radial Kernel
svmfit_radial <- svm(target ~ ., data = train_data, kernel = "radial", gamma = 1, cost = 1)
summary(svmfit_radial)

# Predictions using Radial Kernel SVM
ypred_radial <- predict(svmfit_radial, newdata = test_data)

# Confusion matrix for Radial Kernel
conf_matrix_radial <- confusionMatrix(ypred_radial, test_data$target)
print(conf_matrix_radial)

# Tune SVM with Radial Kernel to find the best cost and gamma parameters
set.seed(144)
tune.out_radial <- tune(svm, target ~ ., data = train_data, kernel = "radial",
                        ranges = list(cost = c(0.1, 1, 10, 100, 1000), gamma = c(0.5, 1, 2, 3, 4)))
summary(tune.out_radial)

# Use the best radial model found through tuning
bestmod_radial <- tune.out_radial$best.model

# Predictions using the best radial model
ypred_best_radial <- predict(bestmod_radial, newdata = test_data)

# Confusion matrix for the best radial model
conf_matrix_best_radial <- confusionMatrix(ypred_best_radial, test_data$target)
print(conf_matrix_best_radial)
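# A compact side-by-side view of test accuracy for the four models fitted above,
# assembled from the confusion-matrix objects already computed (a convenience
# sketch; the model names below are just descriptive labels):
test_accuracy <- data.frame(
  model    = c("Linear (cost = 10)",
               "Linear (tuned)",
               "Radial (gamma = 1, cost = 1)",
               "Radial (tuned)"),
  accuracy = c(conf_matrix$overall["Accuracy"],
               conf_matrix_best$overall["Accuracy"],
               conf_matrix_radial$overall["Accuracy"],
               conf_matrix_best_radial$overall["Accuracy"])
)
print(test_accuracy)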
# ROC Curve for the Radial SVM Model
library(ROCR)
rocplot <- function(pred, truth, ...) {
  predob <- prediction(pred, truth)
  perf <- performance(predob, "tpr", "fpr")
  plot(perf, ...)
}

# Obtain decision values for plotting ROC
svmfit_opt <- svm(target ~ ., data = train_data, kernel = "radial", gamma = 2, cost = 1, decision.values = TRUE)
fitted <- attributes(predict(svmfit_opt, train_data, decision.values = TRUE))$decision.values

# Plot ROC Curve for training data
par(mfrow = c(1, 2))
rocplot(fitted, train_data$target, main = "Training Data (Radial Kernel)")

# Obtain decision values for the test set
fitted_test <- attributes(predict(svmfit_opt, test_data, decision.values = TRUE))$decision.values

# Plot ROC Curve for test data
rocplot(fitted_test, test_data$target, main = "Test Data (Radial Kernel)")
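# The same ROCR objects can also report the area under the curve; a minimal
# sketch for the test set (note that the sign of SVM decision values depends on
# the factor level order, so an AUC far below 0.5 indicates a flipped curve):
pred_test <- prediction(as.numeric(fitted_test), test_data$target)
auc_test  <- performance(pred_test, "auc")@y.values[[1]]
print(auc_test)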
# The Project Ends



© 2025 Syed Faizan. All Rights Reserved.