【学习笔记】Foundations of Strategic Business Analytics

WEEK 1 Finding groups in data
# Segment observations with hierarchical clustering (Ward linkage), then
# summarise and plot the resulting segments.
# Expects two objects to exist already: `testdata` (the numeric variables used
# for clustering) and `data` (a data frame with columns ADS, CV and S).
# NOTE(review): `data` presumably holds the same observations as `testdata`;
# cutree() returns one label per row of `testdata` -- confirm before running.
testdata <- scale(testdata) # centre and scale every column before measuring distances
d <- dist(testdata, method = "euclidean")
hcward <- hclust(d, method = "ward.D")
data$groups <- cutree(hcward, k = 3) # new column `groups`: the cluster label of each observation

# Install lattice only when it is missing instead of on every run.
if (!requireNamespace("lattice", quietly = TRUE)) {
  install.packages("lattice")
}
library(lattice)

# Scatter plot of ADS against CV, one colour/symbol per cluster.
# Symbols and colours go into `superpose.symbol` (the setting used for grouped
# points) so that the legend drawn by auto.key matches the plotted points.
segmentcolours <- c("blue", "green", "red")
print(
  xyplot(
    ADS ~ CV,
    data = data,
    groups = groups, # the groups to be differentiated
    main = "After Clustering",
    type = "p",
    auto.key = list(title = "Group", space = "left", cex = 1.0, just = 0.95),
    par.settings = list(
      superpose.symbol = list(pch = 0:18, cex = 1, col = segmentcolours)
    )
  )
)

# Mean of every variable within each cluster.
aggdata <- aggregate(. ~ groups, data = data, FUN = mean)

# Share of observations per cluster: count the rows of each group (via the S
# column) and divide by the total.
proptemp <- aggregate(S ~ groups, data = data, FUN = length)
aggdata$proportion <- proptemp$S / sum(proptemp$S)

# Largest segment first.
aggdata <- aggdata[order(aggdata$proportion, decreasing = TRUE), ]

# View() opens a viewer window, so only call it in an interactive session.
if (interactive()) {
  View(aggdata)
}
write.csv(aggdata, "HR_example_Numerical_Output.csv", row.names = FALSE)

# Star (segment) diagram of the cluster means. Columns 2..ncol(data) of
# `aggdata` are the averaged variables: column 1 is `groups` and the last
# column is `proportion`.
palette(rainbow(12, s = 0.6, v = 0.75)) # select the colours to use
stars(
  aggdata[, 2:ncol(data)],
  len = 0.6,
  key.loc = c(11, 6),
  xlim = c(2, 12),
  main = "Segments",
  draw.segments = TRUE,
  nrow = 2,
  cex = 0.75,
  labels = aggdata$groups
)


WEEK 2 Factors leading to events



WEEK 3 
# Linear regression for a numeric outcome (Rating) and logistic regression for
# a binary outcome (left), each fitted on `dataold` and applied to `datanew`.
# Expects `dataold` and `datanew` to exist already; `dataold` needs the columns
# Rating and left, `datanew` needs Rating and LPE.

# Estimate a linear regression model of Rating on all other columns.
linreg <- lm(Rating ~ ., data = dataold)

# Obtain the model's predictions on the new data. Predicting on new data is
# prediction; "predicting" on the data used to build the model is fitting.
predcreditscore <- predict(linreg, newdata = datanew, type = "response")

# Correlation between the fitted values and the actual values in the
# modelling data set.
cor(linreg$fitted.values, dataold$Rating)

# Correlation between the predictions and the actual values in the test set.
cor(predcreditscore, datanew$Rating)

# Logistic regression for classification. `family` is the error distribution:
# here the binomial family, whose default link function is the logit.
logreg <- glm(left ~ ., family = binomial(logit), data = dataold)

# Once the model is judged to perform well, use it to make predictions for
# out-of-sample data. type = "response" returns probabilities.
probaToLeave <- predict(logreg, newdata = datanew, type = "response")

# Structure the prediction output in a data frame.
predattrition <- data.frame(probaToLeave)

# Add a column holding the performance (LPE: last project evaluation), giving
# a table that relates LPE to probaToLeave.
predattrition$performance <- datanew$LPE

# Establish a priority score by multiplying performance and probability to
# leave for each employee.
predattrition$priority <- predattrition$performance * predattrition$probaToLeave

# Order the employees by priority, highest first.
# The result keeps the variable name used in the original notes.
orderprebattrition <- predattrition[order(predattrition$priority, decreasing = TRUE), ]
# Survival regression to estimate the remaining lifetime of machines.
# Expects a data frame `data` with the columns lifetime and broken.
# NOTE(review): `broken` is presumably 1 for a machine that has failed and 0
# for one still running (the last step keeps the rows with broken == 0) --
# confirm against the data set.
library(survival) # provides Surv() and survreg()

# Plain linear regression of lifetime on every column except `broken`.
linregmodel <- lm(lifetime ~ . - broken, data = data)

# Step 1: set the dependent variable (observed lifetime plus event indicator).
dependantvars <- Surv(data$lifetime, data$broken)

# Step 2: build a regression model.
# The original notes left the predictors as a "... + ... + ..." placeholder,
# which cannot run. All columns except the two that make up the response are
# used here; replace `.` with an explicit list of predictors if only some of
# them should enter the model.
# The fitted object keeps the name `survreg` from the notes; R still finds the
# function survreg() when it is called.
survreg <- survreg(dependantvars ~ . - lifetime - broken, dist = "gaussian", data = data)

# Step 3: assess the model on out-of-sample data. Do not rely on a model whose
# accuracy is not established or whose limits are unknown.

# Step 4: use the model to estimate the lifetime of the machines. With
# type = "quantile" and p = 0.5 this returns the median predicted lifetime.
Ebreak <- predict(survreg, newdata = data, type = "quantile", p = 0.5)

# Step 5: build the data frame.
forecast <- data.frame(Ebreak)
forecast$lifetime <- data$lifetime
forecast$broken <- data$broken # needed for the filter in step 6
forecast$remainingLT <- forecast$Ebreak - data$lifetime

# Step 6: optimise the output -- shortest remaining lifetime first, keeping
# only the machines that are not currently broken.
forecast <- forecast[order(forecast$remainingLT), ]
actionpriority <- forecast[forecast$broken == 0, ]
# Seasonality analysis of sales with a linear regression on month.
# Expects a data frame `data` with the columns time, sales and month.

# Upper limit of the y axis: 20% above the largest value of sales, so the
# curves do not touch the top of the plot.
saleslimits <- c(0, max(data$sales * 1.2))

# Step 1: plot the sales as a function of time to get a better intuition of
# the data (shows seasonality). The ylim argument sets y-axis limits wider
# than the default.
plot(data$time, data$sales, ylim = saleslimits, type = "l")

# Step 2: build a linear regression model.
regres <- lm(sales ~ month, data = data)

# Step 3: plot the distribution of sales for each month. When `month` is a
# factor, plot() draws one box plot of sales per month.
plot(data$month, data$sales, ylim = saleslimits)

# Step 4: test the model on past data. Plot the actual sales, add the sales
# given by the model (in blue) to the same plot, then add a legend.
plot(data$time, data$sales, ylim = saleslimits, type = "l")
lines(data$time, regres$fitted.values, type = "l", col = "blue", lty = 2) # lty = 2: dashed line
legend(
  "topleft",
  c("Actual sales", "Sales by the model"),
  lty = c(1, 2),
  col = c("black", "blue")
)



WEEK 4 Recommendation production and prioritization