```{r}
# compute AUC from the predROCR prediction object (built in an earlier chunk
# from the oversampled-training-set model -- confirm against that chunk)
performance(predROCR, measure="auc")@y.values[[1]]
```
0.7003633
```{r}
# Evaluate accuracy of the model built using the oversampled training set
# applied to the test set
# obtain probability of defaulting for each observation in test set
lrprobstest <- predict(lrOver, newdata = Test, type = "response")
# obtain predicted class for each observation in test set using threshold of 0.5
lrclasstest <- as.factor(ifelse(lrprobstest > 0.5, "Yes","No"))
```
```{r}
# Create a confusion matrix using "Yes" as the positive class
confusionMatrix(lrclasstest, Test$LoanDefault, positive = "Yes" )
```
```{r}
# Plot ROC Curve for model from oversampled training set using Test set
# create a prediction object to use for the ROC Curve
predROCtest <- prediction(lrprobstest, Test$LoanDefault)
# create a performance object (true positive rate vs. false positive rate)
perfROCtest <- performance(predROCtest,"tpr", "fpr")
```
```{r}
# plot the ROC Curve; the diagonal reference line is a random classifier
plot(perfROCtest)
abline(a=0, b= 1)
```
```{r}
# compute AUC on the test set (slightly below the training AUC above)
performance(predROCtest, measure="auc")@y.values[[1]]
```
0.6888115
```{r}
# predict probability of default for new customers
# read new dataset into R (file must be in the current working directory)
new_customers <- read.csv("OptivaNewData.csv")
View(new_customers)
```
```{r}
# Convert the 0/1 categorical variables to labeled factors.
# A loop over the column names replaces six copies of the same factor()
# call, so adding or renaming a binary column only needs a one-line change.
binary_cols <- c("Entrepreneur", "Unemployed", "Married",
                 "Divorced", "HighSchool", "College")
for (col in binary_cols) {
  new_customers[[col]] <- factor(new_customers[[col]],
                                 levels = c(0, 1),
                                 labels = c("No", "Yes"))
}
```
```{r}
# make predictions for new data (for which loan default is unknown)
# lrOver is the logistic regression fit on the oversampled training set
lrprobsnew <- predict(lrOver, newdata = new_customers , type = "response")
# Attach probability scores to new_customers dataframe as a new column
new_customers <- cbind(new_customers, Probabilities=lrprobsnew)
View(new_customers)
```
```{r}
# Modeling a quadratic trend in a time series using polynomial regression
# read dataset into R
lidf <- read.csv("linked_in.csv")
View(lidf)
```
```{r}
# create a time series plot showing number of LinkedIn members by quarter,
# in millions
ggplot(data = lidf, mapping = aes(x = Quarter, y = Members)) +
  geom_line(group = 1) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "LinkedIn members by Quarter (millions), 2009 to 2014",
       x = "Quarter", y = "Members")
```
```{r}
# Add a column of consecutive numbers corresponding with each quarter
# (seq_len is the safe idiom: 1:nrow(df) misbehaves on a zero-row frame)
lidf$Time <- seq_len(nrow(lidf))
```
```{r}
# Use simple linear regression analysis to create a regression equation for
# forecasting
lireg <- lm(Members ~ Time, data = lidf)
summary(lireg)
```
```{r}
# Create a vector of predicted values generated from the
# regression above
li_pred <- predict(lireg)
# calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs
mae(lidf$Members, li_pred)
mse(lidf$Members, li_pred)
rmse(lidf$Members, li_pred)
# fix: `mape` was previously written without arguments, which only printed
# the function definition instead of computing the percentage error
mape(lidf$Members, li_pred)
```
[1] 11.50083
[1] 172.6512
[1] 13.13968
[1] 12.65052
```{r}
# Create a new variable that squares the Time variable
# (the quadratic term lets the regression capture curvature in the trend)
lidf$Time2 <- lidf$Time^2
```
```{r}
# Use a quadratic regression model to create a regression equation for
# forecasting
liregquad<-lm(Members ~ Time + Time2, data = lidf)
summary(liregquad)
```
```{r}
# Create a vector of predicted values generated from the
# regression above
li_pred2 = predict(liregquad)
# calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs (all four are far lower than the linear
# model's, confirming the quadratic trend fits this series much better)
mae (lidf$Members, li_pred2)
mse (lidf$Members, li_pred2)
rmse (lidf$Members, li_pred2)
mape (lidf$Members, li_pred2)
```
[1] 1.636153
[1] 3.893917
[1] 1.973301
[1] 1.473033
```{r}
# Predict LinkedIn membership for Quarter 3 and Quarter 4 of 2014
# Create an object with the time periods to use for the prediction;
# Time2 is computed from Time instead of hard-coding 529 and 576, so the
# two columns can never get out of sync
new <- data.frame(Time = c(23, 24))
new$Time2 <- new$Time^2
predict(liregquad, newdata = new)
```
```{r}
# Modeling trend and seasonality in a time series using regression
# read dataset into R
wfdf <- read.csv("whole_foods.csv")
View(wfdf)
```
```{r}
# create a time series plot showing quarterly net sales
# fix: the original was missing the "+" after theme(), so labs() was
# evaluated as a separate expression and the plot appeared without labels
ggplot(data = wfdf, mapping = aes(x = Quarter, y = Net.Sales)) +
  geom_line(group = 1) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Whole Foods Quarterly Net Sales 2005 to 2016 in $ millions",
       x = "Quarter", y = "Net Sales")
```
```{r}
# Add a column of consecutive numbers corresponding with each quarter
# (comment previously said "year"; the rows of this dataset are quarters)
wfdf$Time <- seq_len(nrow(wfdf))
```
```{r}
# Create 0/1 dummy variables for each quarter by matching the quarter
# label (e.g. "Q1") inside the Quarter column
wfdf$Q1 <- ifelse(grepl("Q1",wfdf$Quarter), 1, 0)
wfdf$Q2 <- ifelse(grepl("Q2",wfdf$Quarter), 1, 0)
wfdf$Q3 <- ifelse(grepl("Q3",wfdf$Quarter), 1, 0)
wfdf$Q4 <- ifelse(grepl("Q4",wfdf$Quarter), 1, 0)
```
```{r}
# Use regression with the time variable only (trend, no seasonality) to
# generate a regression equation for forecasting
wfreg<-lm(Net.Sales ~ Time, data = wfdf)
summary(wfreg)
```
```{r}
# Create a vector of predicted values generated from the
# regression above
wf_pred = predict(wfreg)
```
```{r}
# calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs
mae(wfdf$Net.Sales, wf_pred)
mse(wfdf$Net.Sales, wf_pred)
rmse(wfdf$Net.Sales, wf_pred)
mape(wfdf$Net.Sales, wf_pred)
```
```{r}
# Use multiple regression with the time and quarters variables to generate
# a regression equation for forecasting (Q1 is omitted, so it serves as
# the reference category)
wfreg2<-lm(Net.Sales ~ Time + Q2 + Q3 + Q4, data = wfdf)
summary(wfreg2)
```
```{r}
# Create a vector of predicted values generated from the multiple
# regression above
wf_pred2 = predict(wfreg2)
```
```{r}
# calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs
mae(wfdf$Net.Sales, wf_pred2)
mse(wfdf$Net.Sales, wf_pred2)
rmse(wfdf$Net.Sales, wf_pred2)
mape(wfdf$Net.Sales, wf_pred2)
```
[1] 0.1385392
[1] 0.02778254
[1] 0.166681
[1] 6.316223
```{r}
# Predict Whole Foods Net Sales for 2017 Q1, Q2, Q3, Q4
# Create an object with the time periods to use for the prediction
# (Time 49-52 continue the series; an all-zero dummy row encodes Q1)
new <- data.frame(Time = c(49, 50, 51, 52), Q2 = c(0,1,0,0), Q3 = c(0,0,1,0),
Q4 = c(0,0,0,1))
predict(wfreg2, newdata = new)
```
```{r}
# Modeling seasonality in a time series using regression
# read dataset into R
nytdf <- read.csv("NYT_revenue.csv")
View(nytdf)
```
```{r}
# create a time series plot showing NYT quarterly revenue
ggplot(data = nytdf, mapping = aes(x = Quarter, y = Revenue)) +
geom_line (group=1) +
geom_point() +
labs(title = "New York Times Quarterly Revenue 2013 to 2016",
x = "Quarter", y = "Revenue")
```
```{r}
# Create 0/1 dummy variables for each quarter by matching the quarter
# label (e.g. "Q1") inside the Quarter column
nytdf$Q1 <- ifelse(grepl("Q1",nytdf$Quarter), 1, 0)
nytdf$Q2 <- ifelse(grepl("Q2",nytdf$Quarter), 1, 0)
nytdf$Q3 <- ifelse(grepl("Q3",nytdf$Quarter), 1, 0)
nytdf$Q4 <- ifelse(grepl("Q4",nytdf$Quarter), 1, 0)
```
```{r}
# Use multiple regression with quarter variables to generate a regression
# equation for forecasting; Q4 is omitted, making it the reference category
nytreg<-lm(Revenue ~ Q1 + Q2 + Q3, data = nytdf)
summary(nytreg)
```
The quarter coefficients are interpreted relative to Quarter 4, the omitted (reference) category; the intercept represents the Quarter 4 mean revenue.
```{r}
#Predict NYT revenue for 2017 Q1, Q2, Q3, Q4
#Create an object with the quarter dummies to use for the prediction
#(an all-zero row encodes Q4, the reference category)
new <- data.frame(Q1 = c(1,0,0,0), Q2 = c(0,1,0,0), Q3 = c(0,0,1,0))
predict(nytreg, newdata = new)
```
```{r}
#install packages (run once, then keep commented out)
#install.packages ("tidyverse")
#load libraries
library(tidyverse)
library(ggplot2)
#set working directory (adjust this for your own computer)
setwd("/Users/myom@cadent.tv/Documents/Eastern/12_DTSC_560_DataScience_for_Business/module4")
```
```{r}
#Modeling a linear time series trend using regression
#read dataset into R
starbucksdf <- read.csv("starbucks_revenue.csv")
View(starbucksdf)
```
```{r}
#create a time series plot showing yearly net revenue in billions
ggplot(data = starbucksdf, mapping = aes(x = Year, y = NetRevenue)) +
geom_line () +
geom_point() +
labs(title = "Starbucks Yearly Net Revenue in Billions of Dollars,
2003 to 2021", x = "Year", y = "Net Revenue")
```
```{r}
#Add a column of consecutive numbers corresponding with each year
#(seq_len is the safe idiom: 1:nrow(df) misbehaves on a zero-row frame)
starbucksdf$Time <- seq_len(nrow(starbucksdf))
```
```{r}
# Use simple linear regression analysis to create a regression equation for
# forecasting net revenue from the time index
sbreg<-lm(NetRevenue ~ Time, data = starbucksdf)
summary(sbreg)
```
The Time coefficient indicates that net revenue grows by about 1.209 billion dollars per year.
```{r}
# Predict Starbucks revenue for 2022 by hand: plug Time = 20 into the
# fitted equation (intercept 2.547 and slope 1.209 from the summary above)
2.547+1.209*20
```
# The model predicts 2022 net revenue of about 26.727 billion dollars
```{r}
# Predict Starbucks revenue for 2022, 2023, 2024
# Create a data frame with the time periods to use for the prediction
new <- data.frame(Time = c(20, 21, 22))
predict(sbreg, newdata = new)
```
# Predicted net revenue for the next three years (2022-2024):
26.72860 27.93767 29.14674
```{r}
#Create functions for the accuracy measures (we've done this before)
# Mean absolute error: average magnitude of the forecast errors.
mae <- function(actual, pred) {
  mean(abs(actual - pred), na.rm = TRUE)
}
# Mean squared error: average of the squared forecast errors.
mse <- function(actual, pred) {
  mean((actual - pred)^2, na.rm = TRUE)
}
# Root mean squared error: error measured in the data's own units.
rmse <- function(actual, pred) {
  sqrt(mean((actual - pred)^2, na.rm = TRUE))
}
# Mean absolute percentage error, expressed as a percentage of actuals.
mape <- function(actual, pred) {
  100 * mean(abs((actual - pred) / actual), na.rm = TRUE)
}
```
```{r}
# Create a vector of predicted values generated from the
# regression above (sbreg)
sb_pred = predict(sbreg)
```
```{r}
# Run the accuracy measure functions with vector of actual values and vector
# of predicted values as inputs
mae (starbucksdf$NetRevenue, sb_pred)
mse (starbucksdf$NetRevenue, sb_pred)
rmse (starbucksdf$NetRevenue, sb_pred)
mape (starbucksdf$NetRevenue, sb_pred)
```
[1] 1.411594
[1] 3.440664
[1] 1.854903
[1] 9.219178
```{r}
# Look at residuals from time series regression
# Steps to create a scatterplot of residuals vs. predicted values of the
# dependent variable
# We have already created a vector of predicted values above
# Create a vector of residuals generated from the regression above
sb_res = resid(sbreg)
# Create a data frame of the predicted values and the residuals
pred_res_df <- data.frame(sb_pred, sb_res)
```
```{r}
# create a scatterplot of the residuals versus the predicted values
# (a visible pattern would suggest the linear trend misses structure)
ggplot(data = pred_res_df, mapping = aes(x = sb_pred, y = sb_res)) +
geom_point() +
labs(title = "Plot of residuals vs. predicted values", x = "Predicted values",
y = "Residuals")
```
```{r}
# install packages (run once; kept commented out)
# install.packages ("tidyverse")
# install.packages("TTR")
```
```{r}
# load libraries (TTR provides the SMA and EMA smoothing functions)
library(tidyverse)
library(TTR)
library(ggplot2)
```
```{r}
# set working directory (adjust this for your own computer)
setwd("/Users/myom/Documents/Eastern/12_DTSC_560_DataScience_for_Business/module4")
# read dataset into R
milkdf <- read.csv("united_dairies.csv")
View(milkdf)
```
```{r}
# create a time series plot showing 12 weeks of milk sales
ggplot(data = milkdf, mapping = aes(x = Week, y = Sales)) +
geom_line () +
geom_point() +
scale_x_continuous(breaks = seq(0, 13, by = 1)) +
labs(title = "Weekly milk sales for United Dairies", x = "Week", y = "Sales")
```
```{r}
# create a separate vector for the actual weekly sales
sales_actuals<-milkdf$Sales
sales_actuals
```
[1] 2750 3100 3250 2800 2900 3050 3300 3100 2950 3000 3200 3150
```{r}
# use the naive method to forecast the 13th week of milk sales:
# each week's forecast is the previous week's actual value, so prepending
# NA shifts the actuals one position to the right
naive13 <- c(NA, sales_actuals)
naive13
```
NA 2750 3100 3250 2800 2900 3050 3300 3100 2950 3000 3200 3150
```{r}
#The last value in the vector is the forecast for sales for the 13th week
#Create functions for the accuracy measures with vector of actual values
#and vector of predicted values as inputs
# mean absolute error
mae <- function(actual, pred) mean(abs(actual - pred), na.rm = TRUE)
# mean square error
mse <- function(actual, pred) mean((actual - pred)^2, na.rm = TRUE)
# Root-mean-square deviation
rmse <- function(actual, pred) sqrt(mean((actual - pred)^2, na.rm = TRUE))
# Mean Absolute Percentage Error (as a percentage)
mape <- function(actual, pred) mean(abs((actual - pred) / actual), na.rm = TRUE) * 100
```
```{r}
# Adjust the vector of predicted values to align with the sales_actuals vector
# by removing the last value (the week-13 forecast, which has no actual yet)
Naive_pred <- naive13[-length(naive13)]
```
```{r}
# Calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs
mae(sales_actuals, Naive_pred)
mse(sales_actuals, Naive_pred)
rmse(sales_actuals, Naive_pred)
mape(sales_actuals, Naive_pred)
```
[1] 190.9091
[1] 50000
[1] 223.6068
[1] 6.269048
```{r}
# use the simple moving average method (3-week window) to forecast the
# 13th week of milk sales; the first two entries are NA because a full
# 3-week window is not yet available there
sma13<-SMA (sales_actuals, n=3)
sma13
```
NA NA 3033.333 3050.000 2983.333 2916.667 3083.333 3150.000 3116.667 3016.667 3050.000 3116.667
```{r}
# The last value in the vector is the forecast for sales for the 13th week
# Adjust the vector of predicted values to align with the sales_actuals
# vector: drop the week-13 forecast and shift right by one (prepend NA)
sales_ma_pred<-c(NA, sma13[-length(sma13)])
sales_ma_pred
```
NA NA NA 3033.333 3050.000 2983.333 2916.667 3083.333 3150.000 3116.667 3016.667 3050.000
```{r}
# Calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs
mae(sales_actuals, sales_ma_pred)
mse(sales_actuals, sales_ma_pred)
rmse(sales_actuals, sales_ma_pred)
mape(sales_actuals, sales_ma_pred)
```
[1] 161.1111
[1] 36203.7
[1] 190.2727
[1] 5.268628
```{r}
#use the exponential smoothing method with alpha = 0.2 to forecast the
#13th week of milk sales (TTR's EMA with n = 1 and ratio = alpha)
exp13 <- EMA (sales_actuals, n=1, ratio = .2)
exp13
```
[1] 2750.000 2820.000 2906.000 2884.800 2887.840 2920.272 2996.218 3016.974 3003.579 3002.863 3042.291 3063.833
```{r}
#The last value in the vector is the forecast for sales for the 13th week
#Adjust the vector of predicted values to align with the sales_actuals vector
exp_pred <- c(NA, exp13[-length(exp13)])
exp_pred
```
NA 2750.000 2820.000 2906.000 2884.800 2887.840 2920.272 2996.218 3016.974 3003.579 3002.863 3042.291
```{r}
# Calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs (note: mape is listed first here, so the
# printed order differs from the earlier accuracy chunks)
mape(sales_actuals, exp_pred)
mae(sales_actuals, exp_pred)
mse(sales_actuals, exp_pred)
rmse(sales_actuals, exp_pred)
```
[1] 5.542897
[1] 174.7518
[1] 50462.68
[1] 224.639
```{r}
# use the exponential smoothing method with alpha = 0.4 to forecast the
# 13th week of milk sales
# a larger smoothing constant (ratio = 0.4) weights recent weeks more heavily
exp13_4 <- EMA (sales_actuals, n=1, ratio = .4)
exp13_4
```
[1] 2750.000 2890.000 3034.000 2940.400 2924.240 2974.544 3104.726 3102.836 3041.702 3025.021 3095.013 3117.008
```{r}
# The last value in the vector is the forecast for sales for the 13th week
# Adjust the vector of predicted values to align with the sales_actuals vector
exp_pred_4 <- c(NA, exp13_4[-length(exp13_4)])
```
```{r}
# Calculate accuracy measures with vector of actual values and vector
# of predicted values as inputs
mae(sales_actuals, exp_pred_4)
mse(sales_actuals, exp_pred_4)
rmse(sales_actuals, exp_pred_4)
mape(sales_actuals, exp_pred_4)
```
[1] 169.5315
[1] 44453.35
[1] 210.8396
[1] 5.4582
#install packages (run once, then keep commented out so the script does not
#reinstall on every run -- matches the convention used earlier in this file)
#install.packages ("tidyverse")
#load libraries
library(tidyverse)
#set working directory (adjust this for your own computer)
setwd("/Users/myom@cadent.tv/Documents/Eastern/12_DTSC_560_DataScience_for_Business")
#read dataset into R
cardf <- read.csv("ToyotaCorolla560.csv")
View(cardf)
#recode FuelType variable with 0 for Diesel and 1 for Petrol
cardf$FuelType<-ifelse(cardf$FuelType=="Petrol",1,0)
#Convert categorical variables to factors with levels and labels
cardf$FuelType<-factor(cardf$FuelType,levels = c(0,1),labels = c("Diesel","Petrol"))
cardf$MetColor<-factor(cardf$MetColor,levels = c(0,1),labels = c("No","Yes"))
cardf$Automatic<-factor(cardf$Automatic,levels = c(0,1),labels = c("No","Yes"))
#check for missing data (0 means no NA values anywhere in the dataframe)
sum(is.na(cardf))
#generate summary statistics for all variables in dataframe
summary(cardf)
#create a scatterplot showing the relationship between Age and Price and add
#a regression line to the scatterplot
ggplot(data = cardf, mapping = aes(x = Age, y = Price)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Scatterplot for Car Age and Car Price", x = "Age", y = "Price")
#Calculate a correlation coefficient for the relationship between Age and Price
cor(cardf$Age, cardf$Price)
#Perform a simple linear regression with Car Age and Car Price
toyota_SLR <- lm(Price ~ Age, data = cardf)
#View the simple linear regression output
summary(toyota_SLR)
#create a correlation matrix with all quantitative variables in the dataframe
#(columns 1, 2, 3 and 5 -- presumably Price, Age, KM and Horsepower; verify
#against the dataset's actual column order)
cor(cardf[c(1, 2, 3, 5)])
#turn off scientific notation for all variables
options(scipen=999)
#Perform a multiple regression with Car Age, KM, and Horsepower and Car Price
toyota_MR <- lm(Price ~ Age + KM + Horsepower, data = cardf)
#View the multiple regression output
summary(toyota_MR)
#install lm.beta package to extract standardized regression coefficients
#(run once, then keep commented out)
#install.packages ("lm.beta")
#load lm.beta
library(lm.beta)
#Extract standardized regression coefficients (comparable across predictors
#because each is expressed in standard-deviation units)
lm.beta(toyota_MR)
#View the multiple regression output
summary(toyota_MR)
#Steps to create a scatterplot of residuals vs. predicted values of the
#dependent variable
#Create a vector of predicted values generated from the multiple
#regression above
toyota_pred = predict(toyota_MR)
#Create a vector of residuals generated from the multiple regression above
toyota_res = resid(toyota_MR)
#Create a data frame of the predicted values and the residuals
pred_res_df <- data.frame(toyota_pred, toyota_res)
#create a scatterplot of the residuals versus the predicted values
#(used to check the linearity and constant-variance assumptions)
ggplot(data = pred_res_df, mapping = aes(x = toyota_pred, y = toyota_res)) +
geom_point() +
labs(title = "Plot of residuals vs. predicted values", x = "Predicted values",
y = "Residuals")
#Steps to create a Normal Probability Plot
#create a vector of standardized residuals generated from the multiple
#regression above
toyota_std.res = rstandard(toyota_MR)
#produce normal scores for the standardized residuals and create
#normal probability plot (points near a straight line support normality)
qqnorm(toyota_std.res, ylab = "Standardized residuals", xlab = "Normal scores")
#install packages (run once, then keep commented out)
#install.packages ("car")
#load libraries (car provides vif())
library(car)
#create a correlation matrix with all quantitative variables in the dataframe
cor(cardf[c(1, 2, 3, 5)])
#calculate Variance Inflation Factor for each variable to assess
#multicollinearity (large values, e.g. above ~10, signal a problem)
vif(toyota_MR)
#Perform a multiple regression with Car Age, KM, Horsepower, FuelType,
#MetColor and Automatic as predictor variables and Car Price as the outcome
#variable
toyota_MR_Cat <- lm(Price ~ Age + KM + Horsepower + FuelType + MetColor + Automatic,
data = cardf)
#View multiple regression output
summary(toyota_MR_Cat)
#Steps to create a new scatterplot of residuals vs. predicted values of the
#dependent variable with the new categorical variables added
#Create a vector of predicted values generated from the multiple
#regression above
toyota_pred2 = predict(toyota_MR_Cat)
#Create a vector of residuals generated from the multiple regression above
toyota_res2 = resid(toyota_MR_Cat)
#Create a data frame of the predicted values and the residuals
pred_res_df2 <- data.frame(toyota_pred2, toyota_res2)
#create a scatterplot of the residuals versus the predicted values
ggplot(data = pred_res_df2, mapping = aes(x = toyota_pred2, y = toyota_res2)) +
geom_point() +
labs(title = "Plot of residuals vs. predicted values", x = "Predicted values",
y = "Residuals")
#Steps to create a Normal Probability Plot
#create a vector of standardized residuals generated from the multiple
#regression with categorical predictors above
#fix: this previously called rstandard(toyota_MR), re-plotting the OLD
#model's residuals; the comment and context require toyota_MR_Cat
toyota_std.res2 <- rstandard(toyota_MR_Cat)
#produce normal scores for the standardized residuals and create
#normal probability plot
qqnorm(toyota_std.res2, ylab = "Standardized residuals", xlab = "Normal scores")
#partition the data into a training set and a validation set
#set seed so the random sample is reproducible
set.seed(42)
#each row is independently assigned TRUE (train) with prob 0.7
sample <- sample(c(TRUE, FALSE), nrow(cardf), replace=TRUE, prob=c(0.7,0.3))
traincar <- cardf[sample, ]
validatecar <- cardf[!sample, ]
#Install package needed for best subsets procedure
#(run once, then keep commented out)
#install.packages("olsrr")
#Load olsrr library
library(olsrr)
#run a multiple regression model using the "train" dataframe and all
#available independent variables
toyota_MR_Alltrain <- lm(Price ~ Age + KM + Horsepower + FuelType + MetColor +
Automatic, data = traincar)
summary(toyota_MR_Alltrain)
#run best subsets procedure with multiple regression output "toyota_MR_Alltrain"
bestsubset <- ols_step_all_possible(toyota_MR_Alltrain)
View(bestsubset)
#run a final multiple regression model using the "validate" dataframe
#and the following predictors: Age, KM, Horsepower, FuelType, and Automatic
#NOTE(review): fitting on the validation set follows the course example, but
#the usual practice is to refit on training data and assess on validation
toyota_MR_Val <- lm(Price ~ Age + KM + Horsepower + FuelType + Automatic,
data = validatecar)
summary(toyota_MR_Val)
#read inventory dataset into R
inventorydf <- read.csv("toyota_corolla_inventory.csv")
View(inventorydf)
#Convert categorical variables to factors with levels and labels
#(must match the factor coding used when the model was fit)
inventorydf$FuelType<-factor(inventorydf$FuelType,levels = c(0,1),labels = c("Diesel","Petrol"))
inventorydf$Automatic<-factor(inventorydf$Automatic,levels = c(0,1),labels = c("No","Yes"))
#estimate predicted y values and 95% prediction intervals for five additional
#Toyota Corollas in inventory
predict(toyota_MR_Val, inventorydf, interval = "prediction", level = 0.95)