```{r}
# compute AUC from the predROCR prediction object (built in an earlier chunk
# from the oversampled-training-set model -- confirm against that chunk)
performance(predROCR, measure="auc")@y.values[[1]]
```
0.7003633
```{r}
# Evaluate accuracy of the model built using the oversampled training set
# applied to the test set
# obtain probability of defaulting for each observation in test set
lrprobstest <- predict(lrOver, newdata = Test, type = "response")
# obtain predicted class for each observation in test set using threshold of 0.5
lrclasstest <- as.factor(ifelse(lrprobstest > 0.5, "Yes","No"))
```
```{r}
# Create a confusion matrix using "Yes" as the positive class
confusionMatrix(lrclasstest, Test$LoanDefault, positive = "Yes" )
```
```{r}
# Plot ROC Curve for model from oversampled training set using Test set
# create a prediction object to use for the ROC Curve
predROCtest <- prediction(lrprobstest, Test$LoanDefault)
# create a performance object (true positive rate vs. false positive rate)
perfROCtest <- performance(predROCtest,"tpr", "fpr")
```
```{r}
# plot the ROC Curve; the diagonal reference line is a random classifier
plot(perfROCtest)
abline(a=0, b= 1)
```
```{r}
# compute AUC on the test set (slightly below the training AUC above)
performance(predROCtest, measure="auc")@y.values[[1]]
```
0.6888115
```{r}
# predict probability of default for new customers
# read new dataset into R (file must be in the current working directory)
new_customers <- read.csv("OptivaNewData.csv")
View(new_customers)
```
```{r}
# Convert the 0/1 categorical variables to labeled factors.
# A loop over the column names replaces six copies of the same factor()
# call, so adding or renaming a binary column only needs a one-line change.
binary_cols <- c("Entrepreneur", "Unemployed", "Married",
                 "Divorced", "HighSchool", "College")
for (col in binary_cols) {
  new_customers[[col]] <- factor(new_customers[[col]],
                                 levels = c(0, 1),
                                 labels = c("No", "Yes"))
}
```
```{r}
# make predictions for new data (for which loan default is unknown)
# lrOver is the logistic regression fit on the oversampled training set
lrprobsnew <- predict(lrOver, newdata = new_customers , type = "response")
# Attach probability scores to new_customers dataframe as a new column
new_customers <- cbind(new_customers, Probabilities=lrprobsnew)
View(new_customers)
```
```{r}
# Modeling a quadratic trend in a time series using polynomial regression
# read dataset into R
lidf <- read.csv("linked_in.csv")
View(lidf)
```
```{r}
# create a time series plot showing number of LinkedIn members by quarter,
# in millions
ggplot(data = lidf, mapping = aes(x = Quarter, y = Members)) +
  geom_line(group = 1) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "LinkedIn members by Quarter (millions), 2009 to 2014",
       x = "Quarter", y = "Members")
```
```{r}
# Add a column of consecutive numbers corresponding with each quarter
# (seq_len is the safe idiom: 1:nrow(df) misbehaves on a zero-row frame)
lidf$Time <- seq_len(nrow(lidf))
```
```{r}
# Use simple linear regression analysis to create a regression equation for
# forecasting
lireg <- lm(Members ~ Time, data = lidf)
summary(lireg)
```
```{r}
# Create a vector of predicted values generated from the
# regression above
li_pred <- predict(lireg)
# calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs
mae(lidf$Members, li_pred)
mse(lidf$Members, li_pred)
rmse(lidf$Members, li_pred)
# fix: `mape` was previously written without arguments, which only printed
# the function definition instead of computing the percentage error
mape(lidf$Members, li_pred)
```
[1] 11.50083
[1] 172.6512
[1] 13.13968
[1] 12.65052
```{r}
# Create a new variable that squares the Time variable
# (the quadratic term lets the regression capture curvature in the trend)
lidf$Time2 <- lidf$Time^2
```
```{r}
# Use a quadratic regression model to create a regression equation for
# forecasting
liregquad<-lm(Members ~ Time + Time2, data = lidf)
summary(liregquad)
```
```{r}
# Create a vector of predicted values generated from the
# regression above
li_pred2 = predict(liregquad)
# calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs (all four are far lower than the linear
# model's, confirming the quadratic trend fits this series much better)
mae (lidf$Members, li_pred2)
mse (lidf$Members, li_pred2)
rmse (lidf$Members, li_pred2)
mape (lidf$Members, li_pred2)
```
[1] 1.636153
[1] 3.893917
[1] 1.973301
[1] 1.473033
```{r}
# Predict LinkedIn membership for Quarter 3 and Quarter 4 of 2014
# Create an object with the time periods to use for the prediction;
# Time2 is computed from Time instead of hard-coding 529 and 576, so the
# two columns can never get out of sync
new <- data.frame(Time = c(23, 24))
new$Time2 <- new$Time^2
predict(liregquad, newdata = new)
```
```{r}
# Modeling trend and seasonality in a time series using regression
# read dataset into R
wfdf <- read.csv("whole_foods.csv")
View(wfdf)
```
```{r}
# create a time series plot showing quarterly net sales
# fix: the original was missing the "+" after theme(), so labs() was
# evaluated as a separate expression and the plot appeared without labels
ggplot(data = wfdf, mapping = aes(x = Quarter, y = Net.Sales)) +
  geom_line(group = 1) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Whole Foods Quarterly Net Sales 2005 to 2016 in $ millions",
       x = "Quarter", y = "Net Sales")
```
```{r}
# Add a column of consecutive numbers corresponding with each quarter
# (comment previously said "year"; the rows of this dataset are quarters)
wfdf$Time <- seq_len(nrow(wfdf))
```
```{r}
# Create 0/1 dummy variables for each quarter by matching the quarter
# label (e.g. "Q1") inside the Quarter column
wfdf$Q1 <- ifelse(grepl("Q1",wfdf$Quarter), 1, 0)
wfdf$Q2 <- ifelse(grepl("Q2",wfdf$Quarter), 1, 0)
wfdf$Q3 <- ifelse(grepl("Q3",wfdf$Quarter), 1, 0)
wfdf$Q4 <- ifelse(grepl("Q4",wfdf$Quarter), 1, 0)
```
```{r}
# Use regression with the time variable only (trend, no seasonality) to
# generate a regression equation for forecasting
wfreg<-lm(Net.Sales ~ Time, data = wfdf)
summary(wfreg)
```
```{r}
# Create a vector of predicted values generated from the
# regression above
wf_pred = predict(wfreg)
```
```{r}
# calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs
mae(wfdf$Net.Sales, wf_pred)
mse(wfdf$Net.Sales, wf_pred)
rmse(wfdf$Net.Sales, wf_pred)
mape(wfdf$Net.Sales, wf_pred)
```
```{r}
# Use multiple regression with the time and quarters variables to generate
# a regression equation for forecasting (Q1 is omitted, so it serves as
# the reference category)
wfreg2<-lm(Net.Sales ~ Time + Q2 + Q3 + Q4, data = wfdf)
summary(wfreg2)
```
```{r}
# Create a vector of predicted values generated from the multiple
# regression above
wf_pred2 = predict(wfreg2)
```
```{r}
# calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs
mae(wfdf$Net.Sales, wf_pred2)
mse(wfdf$Net.Sales, wf_pred2)
rmse(wfdf$Net.Sales, wf_pred2)
mape(wfdf$Net.Sales, wf_pred2)
```
[1] 0.1385392
[1] 0.02778254
[1] 0.166681
[1] 6.316223
```{r}
# Predict Whole Foods Net Sales for 2017 Q1, Q2, Q3, Q4
# Create an object with the time periods to use for the prediction
# (Time 49-52 continue the series; an all-zero dummy row encodes Q1)
new <- data.frame(Time = c(49, 50, 51, 52), Q2 = c(0,1,0,0), Q3 = c(0,0,1,0),
Q4 = c(0,0,0,1))
predict(wfreg2, newdata = new)
```
```{r}
# Modeling seasonality in a time series using regression
# read dataset into R
nytdf <- read.csv("NYT_revenue.csv")
View(nytdf)
```
```{r}
# create a time series plot showing NYT quarterly revenue
ggplot(data = nytdf, mapping = aes(x = Quarter, y = Revenue)) +
geom_line (group=1) +
geom_point() +
labs(title = "New York Times Quarterly Revenue 2013 to 2016",
x = "Quarter", y = "Revenue")
```
```{r}
# Create 0/1 dummy variables for each quarter by matching the quarter
# label (e.g. "Q1") inside the Quarter column
nytdf$Q1 <- ifelse(grepl("Q1",nytdf$Quarter), 1, 0)
nytdf$Q2 <- ifelse(grepl("Q2",nytdf$Quarter), 1, 0)
nytdf$Q3 <- ifelse(grepl("Q3",nytdf$Quarter), 1, 0)
nytdf$Q4 <- ifelse(grepl("Q4",nytdf$Quarter), 1, 0)
```
```{r}
# Use multiple regression with quarter variables to generate a regression
# equation for forecasting; Q4 is omitted, making it the reference category
nytreg<-lm(Revenue ~ Q1 + Q2 + Q3, data = nytdf)
summary(nytreg)
```
The quarter coefficients are interpreted relative to Quarter 4, the omitted (reference) category; the intercept represents the Quarter 4 mean revenue.
```{r}
#Predict NYT revenue for 2017 Q1, Q2, Q3, Q4
#Create an object with the quarter dummies to use for the prediction
#(an all-zero row encodes Q4, the reference category)
new <- data.frame(Q1 = c(1,0,0,0), Q2 = c(0,1,0,0), Q3 = c(0,0,1,0))
predict(nytreg, newdata = new)
```
```{r}
#install packages (run once, then keep commented out)
#install.packages ("tidyverse")
#load libraries
library(tidyverse)
library(ggplot2)
#set working directory (adjust this for your own computer)
setwd("/Users/myom@cadent.tv/Documents/Eastern/12_DTSC_560_DataScience_for_Business/module4")
```
```{r}
#Modeling a linear time series trend using regression
#read dataset into R
starbucksdf <- read.csv("starbucks_revenue.csv")
View(starbucksdf)
```
```{r}
#create a time series plot showing yearly net revenue in billions
ggplot(data = starbucksdf, mapping = aes(x = Year, y = NetRevenue)) +
geom_line () +
geom_point() +
labs(title = "Starbucks Yearly Net Revenue in Billions of Dollars,
2003 to 2021", x = "Year", y = "Net Revenue")
```
```{r}
#Add a column of consecutive numbers corresponding with each year
#(seq_len is the safe idiom: 1:nrow(df) misbehaves on a zero-row frame)
starbucksdf$Time <- seq_len(nrow(starbucksdf))
```
```{r}
# Use simple linear regression analysis to create a regression equation for
# forecasting net revenue from the time index
sbreg<-lm(NetRevenue ~ Time, data = starbucksdf)
summary(sbreg)
```
The Time coefficient indicates that net revenue grows by about 1.209 billion dollars per year.
```{r}
# Predict Starbucks revenue for 2022 by hand: plug Time = 20 into the
# fitted equation (intercept 2.547 and slope 1.209 from the summary above)
2.547+1.209*20
```
# The model predicts 2022 net revenue of about 26.727 billion dollars
```{r}
# Predict Starbucks revenue for 2022, 2023, 2024
# Create a data frame with the time periods to use for the prediction
new <- data.frame(Time = c(20, 21, 22))
predict(sbreg, newdata = new)
```
# Predicted net revenue for the next three years (2022-2024):
26.72860 27.93767 29.14674
```{r}
#Create functions for the accuracy measures (we've done this before)
# Mean absolute error: average magnitude of the forecast errors.
mae <- function(actual, pred) {
  mean(abs(actual - pred), na.rm = TRUE)
}
# Mean squared error: average of the squared forecast errors.
mse <- function(actual, pred) {
  mean((actual - pred)^2, na.rm = TRUE)
}
# Root mean squared error: error measured in the data's own units.
rmse <- function(actual, pred) {
  sqrt(mean((actual - pred)^2, na.rm = TRUE))
}
# Mean absolute percentage error, expressed as a percentage of actuals.
mape <- function(actual, pred) {
  100 * mean(abs((actual - pred) / actual), na.rm = TRUE)
}
```
```{r}
# Create a vector of predicted values generated from the
# regression above (sbreg)
sb_pred = predict(sbreg)
```
```{r}
# Run the accuracy measure functions with vector of actual values and vector
# of predicted values as inputs
mae (starbucksdf$NetRevenue, sb_pred)
mse (starbucksdf$NetRevenue, sb_pred)
rmse (starbucksdf$NetRevenue, sb_pred)
mape (starbucksdf$NetRevenue, sb_pred)
```
[1] 1.411594
[1] 3.440664
[1] 1.854903
[1] 9.219178
```{r}
# Look at residuals from time series regression
# Steps to create a scatterplot of residuals vs. predicted values of the
# dependent variable
# We have already created a vector of predicted values above
# Create a vector of residuals generated from the regression above
sb_res = resid(sbreg)
# Create a data frame of the predicted values and the residuals
pred_res_df <- data.frame(sb_pred, sb_res)
```
```{r}
# create a scatterplot of the residuals versus the predicted values
# (a visible pattern would suggest the linear trend misses structure)
ggplot(data = pred_res_df, mapping = aes(x = sb_pred, y = sb_res)) +
geom_point() +
labs(title = "Plot of residuals vs. predicted values", x = "Predicted values",
y = "Residuals")
```
```{r}
# install packages (run once; kept commented out)
# install.packages ("tidyverse")
# install.packages("TTR")
```
```{r}
# load libraries (TTR provides the SMA and EMA smoothing functions)
library(tidyverse)
library(TTR)
library(ggplot2)
```
```{r}
# set working directory (adjust this for your own computer)
setwd("/Users/myom/Documents/Eastern/12_DTSC_560_DataScience_for_Business/module4")
# read dataset into R
milkdf <- read.csv("united_dairies.csv")
View(milkdf)
```
```{r}
# create a time series plot showing 12 weeks of milk sales
ggplot(data = milkdf, mapping = aes(x = Week, y = Sales)) +
geom_line () +
geom_point() +
scale_x_continuous(breaks = seq(0, 13, by = 1)) +
labs(title = "Weekly milk sales for United Dairies", x = "Week", y = "Sales")
```
```{r}
# create a separate vector for the actual weekly sales
sales_actuals<-milkdf$Sales
sales_actuals
```
[1] 2750 3100 3250 2800 2900 3050 3300 3100 2950 3000 3200 3150
```{r}
# use the naive method to forecast the 13th week of milk sales:
# each week's forecast is the previous week's actual value, so prepending
# NA shifts the actuals one position to the right
naive13 <- c(NA, sales_actuals)
naive13
```
NA 2750 3100 3250 2800 2900 3050 3300 3100 2950 3000 3200 3150
```{r}
#The last value in the vector is the forecast for sales for the 13th week
#Create functions for the accuracy measures with vector of actual values
#and vector of predicted values as inputs
# mean absolute error
mae <- function(actual, pred) mean(abs(actual - pred), na.rm = TRUE)
# mean square error
mse <- function(actual, pred) mean((actual - pred)^2, na.rm = TRUE)
# Root-mean-square deviation
rmse <- function(actual, pred) sqrt(mean((actual - pred)^2, na.rm = TRUE))
# Mean Absolute Percentage Error (as a percentage)
mape <- function(actual, pred) mean(abs((actual - pred) / actual), na.rm = TRUE) * 100
```
```{r}
# Adjust the vector of predicted values to align with the sales_actuals vector
# by removing the last value (the week-13 forecast, which has no actual yet)
Naive_pred <- naive13[-length(naive13)]
```
```{r}
# Calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs
mae(sales_actuals, Naive_pred)
mse(sales_actuals, Naive_pred)
rmse(sales_actuals, Naive_pred)
mape(sales_actuals, Naive_pred)
```
[1] 190.9091
[1] 50000
[1] 223.6068
[1] 6.269048
```{r}
# use the simple moving average method (3-week window) to forecast the
# 13th week of milk sales; the first two entries are NA because a full
# 3-week window is not yet available there
sma13<-SMA (sales_actuals, n=3)
sma13
```
NA NA 3033.333 3050.000 2983.333 2916.667 3083.333 3150.000 3116.667 3016.667 3050.000 3116.667
```{r}
# The last value in the vector is the forecast for sales for the 13th week
# Adjust the vector of predicted values to align with the sales_actuals
# vector: drop the week-13 forecast and shift right by one (prepend NA)
sales_ma_pred<-c(NA, sma13[-length(sma13)])
sales_ma_pred
```
NA NA NA 3033.333 3050.000 2983.333 2916.667 3083.333 3150.000 3116.667 3016.667 3050.000
```{r}
# Calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs
mae(sales_actuals, sales_ma_pred)
mse(sales_actuals, sales_ma_pred)
rmse(sales_actuals, sales_ma_pred)
mape(sales_actuals, sales_ma_pred)
```
[1] 161.1111
[1] 36203.7
[1] 190.2727
[1] 5.268628
```{r}
#use the exponential smoothing method with alpha = 0.2 to forecast the
#13th week of milk sales (TTR's EMA with n = 1 and ratio = alpha)
exp13 <- EMA (sales_actuals, n=1, ratio = .2)
exp13
```
[1] 2750.000 2820.000 2906.000 2884.800 2887.840 2920.272 2996.218 3016.974 3003.579 3002.863 3042.291 3063.833
```{r}
#The last value in the vector is the forecast for sales for the 13th week
#Adjust the vector of predicted values to align with the sales_actuals vector
exp_pred <- c(NA, exp13[-length(exp13)])
exp_pred
```
NA 2750.000 2820.000 2906.000 2884.800 2887.840 2920.272 2996.218 3016.974 3003.579 3002.863 3042.291
```{r}
# Calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs (note: mape is listed first here, so the
# printed order differs from the earlier accuracy chunks)
mape(sales_actuals, exp_pred)
mae(sales_actuals, exp_pred)
mse(sales_actuals, exp_pred)
rmse(sales_actuals, exp_pred)
```
[1] 5.542897
[1] 174.7518
[1] 50462.68
[1] 224.639
```{r}
# use the exponential smoothing method with alpha = 0.4 to forecast the
# 13th week of milk sales
# a larger smoothing constant (ratio = 0.4) weights recent weeks more heavily
exp13_4 <- EMA (sales_actuals, n=1, ratio = .4)
exp13_4
```
[1] 2750.000 2890.000 3034.000 2940.400 2924.240 2974.544 3104.726 3102.836 3041.702 3025.021 3095.013 3117.008
```{r}
# The last value in the vector is the forecast for sales for the 13th week
# Adjust the vector of predicted values to align with the sales_actuals vector
exp_pred_4 <- c(NA, exp13_4[-length(exp13_4)])
```
```{r}
# Calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs
mae(sales_actuals, exp_pred_4)
mse(sales_actuals, exp_pred_4)
rmse(sales_actuals, exp_pred_4)
mape(sales_actuals, exp_pred_4)
```
[1] 169.5315
[1] 44453.35
[1] 210.8396
[1] 5.4582
#install packages (run once, then keep commented out so the script does not
#reinstall on every run -- matches the convention used earlier in this file)
#install.packages ("tidyverse")
#load libraries
library(tidyverse)
#set working directory (adjust this for your own computer)
setwd("/Users/myom@cadent.tv/Documents/Eastern/12_DTSC_560_DataScience_for_Business")
#read dataset into R
cardf <- read.csv("ToyotaCorolla560.csv")
View(cardf)
#recode FuelType variable with 0 for Diesel and 1 for Petrol
cardf$FuelType<-ifelse(cardf$FuelType=="Petrol",1,0)
#Convert categorical variables to factors with levels and labels
cardf$FuelType<-factor(cardf$FuelType,levels = c(0,1),labels = c("Diesel","Petrol"))
cardf$MetColor<-factor(cardf$MetColor,levels = c(0,1),labels = c("No","Yes"))
cardf$Automatic<-factor(cardf$Automatic,levels = c(0,1),labels = c("No","Yes"))
#check for missing data (0 means no NA values anywhere in the dataframe)
sum(is.na(cardf))
#generate summary statistics for all variables in dataframe
summary(cardf)
#create a scatterplot showing the relationship between Age and Price and add
#a regression line to the scatterplot
ggplot(data = cardf, mapping = aes(x = Age, y = Price)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Scatterplot for Car Age and Car Price", x = "Age", y = "Price")
#Calculate a correlation coefficient for the relationship between Age and Price
cor(cardf$Age, cardf$Price)
#Perform a simple linear regression with Car Age and Car Price
toyota_SLR <- lm(Price ~ Age, data = cardf)
#View the simple linear regression output
summary(toyota_SLR)
#create a correlation matrix with all quantitative variables in the dataframe
#(columns 1, 2, 3 and 5 -- presumably Price, Age, KM and Horsepower; verify
#against the dataset's actual column order)
cor(cardf[c(1, 2, 3, 5)])
#turn off scientific notation for all variables
options(scipen=999)
#Perform a multiple regression with Car Age, KM, and Horsepower and Car Price
toyota_MR <- lm(Price ~ Age + KM + Horsepower, data = cardf)
#View the multiple regression output
summary(toyota_MR)
#install lm.beta package to extract standardized regression coefficients
#(run once, then keep commented out)
#install.packages ("lm.beta")
#load lm.beta
library(lm.beta)
#Extract standardized regression coefficients (comparable across predictors
#because each is expressed in standard-deviation units)
lm.beta(toyota_MR)
#View the multiple regression output
summary(toyota_MR)
#Steps to create a scatterplot of residuals vs. predicted values of the
#dependent variable
#Create a vector of predicted values generated from the multiple
#regression above
toyota_pred = predict(toyota_MR)
#Create a vector of residuals generated from the multiple regression above
toyota_res = resid(toyota_MR)
#Create a data frame of the predicted values and the residuals
pred_res_df <- data.frame(toyota_pred, toyota_res)
#create a scatterplot of the residuals versus the predicted values
#(used to check the linearity and constant-variance assumptions)
ggplot(data = pred_res_df, mapping = aes(x = toyota_pred, y = toyota_res)) +
geom_point() +
labs(title = "Plot of residuals vs. predicted values", x = "Predicted values",
y = "Residuals")
#Steps to create a Normal Probability Plot
#create a vector of standardized residuals generated from the multiple
#regression above
toyota_std.res = rstandard(toyota_MR)
#produce normal scores for the standardized residuals and create
#normal probability plot (points near a straight line support normality)
qqnorm(toyota_std.res, ylab = "Standardized residuals", xlab = "Normal scores")
#install packages (run once, then keep commented out)
#install.packages ("car")
#load libraries (car provides vif())
library(car)
#create a correlation matrix with all quantitative variables in the dataframe
cor(cardf[c(1, 2, 3, 5)])
#calculate Variance Inflation Factor for each variable to assess
#multicollinearity (large values, e.g. above ~10, signal a problem)
vif(toyota_MR)
#Perform a multiple regression with Car Age, KM, Horsepower, FuelType,
#MetColor and Automatic as predictor variables and Car Price as the outcome
#variable
toyota_MR_Cat <- lm(Price ~ Age + KM + Horsepower + FuelType + MetColor + Automatic,
data = cardf)
#View multiple regression output
summary(toyota_MR_Cat)
#Steps to create a new scatterplot of residuals vs. predicted values of the
#dependent variable with the new categorical variables added
#Create a vector of predicted values generated from the multiple
#regression above
toyota_pred2 = predict(toyota_MR_Cat)
#Create a vector of residuals generated from the multiple regression above
toyota_res2 = resid(toyota_MR_Cat)
#Create a data frame of the predicted values and the residuals
pred_res_df2 <- data.frame(toyota_pred2, toyota_res2)
#create a scatterplot of the residuals versus the predicted values
ggplot(data = pred_res_df2, mapping = aes(x = toyota_pred2, y = toyota_res2)) +
geom_point() +
labs(title = "Plot of residuals vs. predicted values", x = "Predicted values",
y = "Residuals")
#Steps to create a Normal Probability Plot
#create a vector of standardized residuals generated from the multiple
#regression with categorical predictors above
#fix: this previously called rstandard(toyota_MR), re-plotting the OLD
#model's residuals; the comment and context require toyota_MR_Cat
toyota_std.res2 <- rstandard(toyota_MR_Cat)
#produce normal scores for the standardized residuals and create
#normal probability plot
qqnorm(toyota_std.res2, ylab = "Standardized residuals", xlab = "Normal scores")
#partition the data into a training set and a validation set
#set seed so the random sample is reproducible
set.seed(42)
#each row is independently assigned TRUE (train) with prob 0.7
sample <- sample(c(TRUE, FALSE), nrow(cardf), replace=TRUE, prob=c(0.7,0.3))
traincar <- cardf[sample, ]
validatecar <- cardf[!sample, ]
#Install package needed for best subsets procedure
#(run once, then keep commented out)
#install.packages("olsrr")
#Load olsrr library
library(olsrr)
#run a multiple regression model using the "train" dataframe and all
#available independent variables
toyota_MR_Alltrain <- lm(Price ~ Age + KM + Horsepower + FuelType + MetColor +
Automatic, data = traincar)
summary(toyota_MR_Alltrain)
#run best subsets procedure with multiple regression output "toyota_MR_Alltrain"
bestsubset <- ols_step_all_possible(toyota_MR_Alltrain)
View(bestsubset)
#run a final multiple regression model using the "validate" dataframe
#and the following predictors: Age, KM, Horsepower, FuelType, and Automatic
#NOTE(review): fitting on the validation set follows the course example, but
#the usual practice is to refit on training data and assess on validation
toyota_MR_Val <- lm(Price ~ Age + KM + Horsepower + FuelType + Automatic,
data = validatecar)
summary(toyota_MR_Val)
#read inventory dataset into R
inventorydf <- read.csv("toyota_corolla_inventory.csv")
View(inventorydf)
#Convert categorical variables to factors with levels and labels
#(must match the factor coding used when the model was fit)
inventorydf$FuelType<-factor(inventorydf$FuelType,levels = c(0,1),labels = c("Diesel","Petrol"))
inventorydf$Automatic<-factor(inventorydf$Automatic,levels = c(0,1),labels = c("No","Yes"))
#estimate predicted y values and 95% prediction intervals for five additional
#Toyota Corollas in inventory
predict(toyota_MR_Val, inventorydf, interval = "prediction", level = 0.95)