## Listing 1 www = "http://www.rauldicello.com/raul.csv" raul = read.csv(www, header = TRUE) attach(raul) head(raul) tail(raul) shapiro.test(AGE[Outcome == "benignant"])$p.value shapiro.test(AGE[Outcome == "malignant"])$p.value wilcox.test(AGE ~ Outcome) shapiro.test(CA125[Outcome == "benignant"])$p.value shapiro.test(CA125[Outcome == "malignant"])$p.value wilcox.test(CA125 ~ Outcome) shapiro.test(LDH1[Outcome == "benignant"])$p.value shapiro.test(LDH1[Outcome == "malignant"])$p.value wilcox.test(LDH1 ~ Outcome) shapiro.test(LDH3[Outcome == "benignant"])$p.value shapiro.test(LDH3[Outcome == "malignant"])$p.value wilcox.test(LDH3 ~ Outcome) fisher.test(table(CenV, Outcome)) fisher.test(table(PerV, Outcome)) ## Listing 2 library(party) tree01 = ctree(Outcome ~ AGE + CA125) plot(tree01, main = "conditional regression tree (AGE, CA125)") tree02 = ctree(Outcome ~ PerV) plot(tree02, main = "conditional regression tree (Peripheral Vascularization)") tree03 = ctree(Outcome ~ LDH1 + LDH3 + CenV) plot(tree03, main = "conditional regression tree (LDH1, LDH3, Central Vascularization)") ## Listing 3 model01 = glm(Outcome ~ AGE + CA125, family = binomial) AIC(model01) # 323 model02 = glm(Outcome ~ PerV, family = binomial) AIC(model02) # 438 model03 = glm(Outcome ~ LDH1 + LDH3 + CenV, family = binomial) AIC(model03) # 28 tree00 = ctree(Outcome ~ AGE + CA125 + LDH1 + LDH3 + CenV + PerV) plot(tree00, main = "all covariates conditional regression tree") ## Listing 4 library(ggplot2) ggplot(raul, aes(x= LDH1, y= LDH3)) + geom_point(alpha = 0.3, position = position_jitter(width = 0.35, height = 0.35)) + geom_smooth(method=lm) modelA = lm(LDH3 ~ LDH1) summary(modelA) cor(LDH1, LDH3, use = "complete.obs") par(mfrow = c(1,2)) boxplot(LDH3 ~ Outcome, main = "LDH3") boxplot(LDH1 ~ Outcome, main = "LDH1") ## Listing 5 Score1 = LDH3 + 1/LDH1 Lesion = factor(3 - as.numeric(Outcome)) levels(Lesion)[1] = "Malignant" levels(Lesion)[2] = "Benignant" ggplot(raul, aes(x= LDH3, y= Score1, shape = Lesion, color = Lesion)) + geom_point(alpha = 0.3, position = position_jitter(width = 2, height = 0.5)) ## Listing 6 for (alpha in 1:32){ ScoreAlpha = LDH3 + alpha/LDH1 albero = ctree(Outcome ~ ScoreAlpha + CenV) print(albero)} ## Listing 7 library(ROCR) roccurve = prediction( Raul, Outcome ) (areaunder = performance(roccurve, "auc")@y.values) # 0.999 (contingency = table(Raul <= 29, Outcome)) contingency[2] / (contingency[1]+contingency[2]) # spec = 0.9959 contingency[3] / (contingency[1]+contingency[3]) # negpredv = 0.8269 (contingency = table(CenV[Raul > 29], Outcome[Raul > 29])) contingency[4] / (contingency[3]+contingency[4]) # sens = 0.9767 contingency[4] / (contingency[2]+contingency[4]) # pospredv = 0.9767 contingency[1] / (contingency[1]+contingency[2]) # 0.8889 ## Listing 8 x = seq(from = 0.9999, to = 0.8000, by = -0.0001) y = 1 - pbinom(42, 43, x) x[which( y < 0.01 )[1]] # 0.8984 ## Listing 9 ptm = proc.time() set.seed(1234) # for replicability NN = 10000 validation = matrix(rep(NA, 3 * NN), nrow = NN, ncol = 3) validation for (i in 1:NN){ extractThousand = sample(1:2750, size = 1000, replace = TRUE) sub = raul[extractThousand,] sOutcome = sub$Outcome sRaul = sub$LDH3 + 24/sub$LDH1 sAGE = sub$AGE sCA125 = sub$CA125 sLDH1 = sub$LDH1 sCenV = sub$CenV sPerV = sub$PerV sLDH3 = sub$LDH3 stree = ctree(sOutcome ~ sRaul + sAGE + sCA125 + sLDH1 + sCenV + sPerV + sLDH3, data = sub) vstree = c(stree@tree$psplit) validation[i, 1] = vstree$variableName # validation[i, 2] = vstree$splitpoint validation[i, 3] = sum(sOutcome == "malignant") } proc.time() - ptm # user system elapsed #148.547 1.377 148.515 # write.csv(validation, file = "validationobtained.csv") validationobtained = read.csv(file.choose(), header = TRUE) attach(validationobtained) head(validationobtained) tail(validationobtained) table(predictor) boxplot(howmanymalgn ~ predictor, varwidth = TRUE) detach(validationobtained) rm(validationobtained) ## Listing 10 ptm = proc.time() set.seed(1234) # for replicability NN = 10000 answer = matrix(rep(NA, 3 * NN), nrow = NN, ncol = 3) answer for (i in 1:NN){ extractThousand = sample(1:2750, size = 1000, replace = TRUE) sub = raul[extractThousand,] sOutcome = sub$Outcome sRaul = sub$LDH3 + 24/sub$LDH1 stree = ctree(sOutcome ~ sRaul, data = sub) vstree = c(stree@tree$psplit) answer[i, 1] = vstree$variableName answer[i, 2] = vstree$splitpoint answer[i, 3] = sum(sub$Outcome == "malignant") } answer proc.time() - ptm # user system elapsed # 85.954 0.850 86.131 # write.csv(answer, file = "answerobtained.csv") answerobtained = read.csv(file.choose(), header = TRUE) attach(answerobtained) head(answerobtained) tail(answerobtained) boxplot(cutoff) quantile(cutoff, c(0.025, 0.975)) # 95% c.i. # 2.5% 97.5% # 28.8 30.1