plot3d(x=pca$x[,1],y=pca$x[,2],z=pca$x[,3], col = c("red","blue")[clinical$preeclampsia + 1], type="s",radius=3)
clinical = pe.getClinicalInfo()
plotTemplate = function(pca, plotTitle){
df <- cbind(clinical, as.data.frame(pca$x))
importance = summary(pca)$importance
label1 = importance[2] * 100
label2 = importance[5] * 100
ggplot(df, aes(x=PC1, y=PC2, color=clinical$ethnicityCategory, shape = as.factor(mixed))) +
geom_point(size = 3) +
scale_color_manual(values = brewer.pal(n = 8, name = "Set1"), labels = levels(clinical$ethnicityCategory), name = "Group") +
labs(title = plotTitle, x = glue("PC1 - {round(label1, 1)}%"), y = glue("PC2 - {round(label2, 1)}%")) +
scale_shape_manual(name = "Ethnicities", labels = c("Single", "Mixed"), values = c(16,17) )
}
pca = prcomp(pe.getNumericAlleleMatrix())
plotTemplate(pca, "QC: PCA of SNPs shows clustering by reported ethnicity, as expected")
plot3d(x=pca$x[,1],y=pca$x[,2],z=pca$x[,3], col = brewer.pal(n = 4, name = "Set1")[clinical$ethnicityCategory], type="s",radius=3)
clinical = pe.getClinicalInfo()
plotTemplateDiseaseEth = function(pca, plotTitle, clinicalData = pe.getClinicalInfo()) {
df <- cbind(clinicalData, as.data.frame(pca$x))
importance = summary(pca)$importance
label1 = importance[2] * 100
label2 = importance[5] * 100
ggplot(df, aes(x=PC1, y=PC2, color=as.factor(preeclampsia), shape = as.factor(mixed))) + geom_point(size = 3) +
scale_color_manual(values = brewer.pal(n = 8, name = "Set1"), labels = c("E.O. Preeclampsia", "Control"), name = "Group") +
scale_shape_manual(name = "Ethnicities", labels = c("Single", "Mixed"), values = c(16,17) ) +
# geom_text(aes(label=ifelse(PC1 > 50 & (PC2 > 50 | PC2 < -50), patientId, "")),hjust=-.5, vjust=-.5, size = 2.5, color = "black") +
labs(title = plotTitle, x = glue("PC1 - {round(label1, 1)}%"), y = glue("PC2 - {round(label2, 1)}%"))
}
domRecMatrix = pe.getNumericAlleleMatrix()
ress = lapply(levels(clinical$ethnicityCategory), function (name) {
groupIds = clinical$ethnicityCategory == name
pca = prcomp(domRecMatrix[groupIds,])
plotTemplateDiseaseEth(pca, glue("PCA of SNPs for {name} Ethnicity shows no clustering by disease"), clinical[groupIds,])
})
ress
[[1]]
[[2]]
[[3]]
[[4]]
source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
qqPlot(pe.getPValsAll()$minPval, main = "QQPlot of p-values is consistent with uniform null distribution")
source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
res = lapply(c("dominant", "codominant", "recessive", "overdominant", "log-additive"), function (modelName) {
cache.get(modelName, function () stop("Use thread.r to compute pvals.")) %>% as.data.frame %>% rownames_to_column(var = "snp") %>% arrange_at(3) %>% dplyr::slice(1:10) %>% select(-one_of("comments"))
})
res
[[1]]
snp dominant
1 rs7804641 2.271119e-05
2 rs30117 4.810937e-05
3 rs10482796 1.335995e-04
4 rs6604616 1.366858e-04
5 rs11774014 2.088724e-04
6 rs209549 2.138686e-04
7 rs4512191 2.138686e-04
8 rs1946016 2.169683e-04
9 rs1728232 2.341614e-04
10 rs4822422 2.341614e-04
[[2]]
snp codominant
1 chr5.32898472 2.831337e-05
2 chr5.32888767 3.405689e-05
3 chr5.32897461 4.118232e-05
4 chr5.32901811 4.118232e-05
5 chr5.32889877 9.170883e-05
6 chr5.32903017 9.170883e-05
7 rs209549 1.028533e-04
8 rs7804641 1.069225e-04
9 rs516451 1.222002e-04
10 rs6604616 1.366858e-04
[[3]]
snp recessive
1 chr5.32898472 4.742674e-06
2 chr5.32888767 1.148135e-05
3 chr5.32897461 1.719846e-05
4 chr5.32901811 1.719846e-05
5 chr5.32889877 4.953733e-05
6 chr5.32903017 4.953733e-05
7 rs1147227 4.953733e-05
8 rs516451 4.953733e-05
9 chr5.32884755 1.366858e-04
10 chr5.32890721 1.366858e-04
[[4]]
snp overdominant
1 rs7804641 6.020586e-05
2 chr11.48008018 9.513349e-05
3 rs817959 1.335995e-04
4 rs17773842 1.366858e-04
5 rs6604616 1.366858e-04
6 rs12518851 1.389169e-04
7 rs6061194 1.389896e-04
8 rs3104996 1.535729e-04
9 rs12502984 1.570357e-04
10 chr7.14684243 1.798556e-04
[[5]]
snp log-additive
1 rs209549 2.167011e-05
2 chr5.32897461 2.712220e-05
3 chr5.32901811 2.712220e-05
4 chr5.32888767 2.809238e-05
5 rs30117 3.956550e-05
6 chr5.32889877 4.689104e-05
7 chr5.32903017 4.689104e-05
8 rs516451 8.178124e-05
9 rs10482796 8.805029e-05
10 rs7804641 1.069225e-04
res = lapply(c("dominant", "codominant", "recessive", "overdominant", "log-additive"), function (modelName) {
Bonferroni.sig(cache.get(modelName, function () stop("Use thread.r to compute pvals.")), model = modelName, alpha = 0.05, include.all.SNPs=FALSE)
})
number of tests: 132667
alpha: 0.05
corrected alpha: 3.768835e-07
No significant SNPs after Bonferroni correction
number of tests: 132709
alpha: 0.05
corrected alpha: 3.767642e-07
No significant SNPs after Bonferroni correction
number of tests: 132692
alpha: 0.05
corrected alpha: 3.768125e-07
No significant SNPs after Bonferroni correction
number of tests: 132669
alpha: 0.05
corrected alpha: 3.768778e-07
No significant SNPs after Bonferroni correction
number of tests: 132697
alpha: 0.05
corrected alpha: 3.767983e-07
No significant SNPs after Bonferroni correction
source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
clinicalRaw = read_csv(pe.clinicalDataPath, col_types = cols())
nrow(clinicalRaw)
[1] 109
head(clinicalRaw)
49 of the mothers were excluded due to lack of adequate maternal sample. Exclude these from clinical
clinical = pe.getClinicalInfo()
skim_tee(clinical)
Skim summary statistics
n obs: 60
n variables: 21
── Variable type:character ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
variable missing complete n min max empty n_unique
ethnicity1 0 60 60 6 12 0 11
ethnicity2 37 23 60 5 15 0 10
ethnicity3 44 16 60 7 19 0 8
── Variable type:factor ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
variable missing complete n n_unique
ethnicityCategory 0 60 60 4
top_counts ordered
Fil: 27, Cau: 19, Asi: 7, Pac: 7 FALSE
── Variable type:integer ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
variable missing complete n mean sd p0 p25 p50
3HrGlucose1HrGlucola 24 36 60 126.06 28.61 85 103 121.5
abruption 0 60 60 0.13 0.34 0 0 0
daughter 0 60 60 0.52 0.5 0 0 1
gdm 0 60 60 0.1 0.3 0 0 0
iugr 0 60 60 0.15 0.36 0 0 0
mixed 0 60 60 0.38 0.49 0 0 0
momAge 0 60 60 29.6 5.71 18 26 31
patientId 0 60 60 57.05 31.08 2 32.75 59.5
preeclampsia 0 60 60 0.52 0.5 0 0 1
primaGravida 0 60 60 0.35 0.48 0 0 0
p75 p100 hist
145.75 195 ▇▅▇▅▃▂▃▁
0 1 ▇▁▁▁▁▁▁▁
1 1 ▇▁▁▁▁▁▁▇
0 1 ▇▁▁▁▁▁▁▁
0 1 ▇▁▁▁▁▁▁▂
1 1 ▇▁▁▁▁▁▁▅
34 42 ▅▂▃▆▇▇▁▂
82.25 102 ▅▃▃▅▃▅▅▇
1 1 ▇▁▁▁▁▁▁▇
1 1 ▇▁▁▁▁▁▁▅
── Variable type:numeric ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
variable missing complete n mean sd p0 p25
babyWgtGram 0 60 60 2398.94 1013.13 730 1552.5
bmi 5 55 60 25.45 6.41 18.6 21.2
ethnicityPercent1 9 51 60 78.71 31.9 1 50
ethnicityPercent2 51 9 60 31.11 18.96 5 12.5
ethnicityPercent3 55 5 60 29.5 13.39 12.5 25
gestAge 0 60 60 35.66 4.04 27 32.88
placentaWgtGram 33 27 60 351.43 105.06 165 266.5
p50 p75 p100 hist
2386 3345.75 4193 ▅▆▇▃▂▅▇▃
23.2 26.95 48.5 ▇▅▂▂▁▁▁▁
100 100 100 ▁▁▁▃▁▁▁▇
25 50 50 ▂▃▁▃▁▁▁▇
25 37.5 47.5 ▃▁▇▁▁▃▁▃
34.75 39.4 41.2 ▂▂▂▆▃▁▇▆
366 424 533 ▃▃▂▃▇▂▃▃
clinicalNumeric = pe.getClinicalAsNumerics()
clinicalNumeric[1:5,]
In the plot below, we see only expected correlations.
corrplot(cor(clinicalNumeric), method="circle", type="upper")
Let’s check correlation of patients just as a sanity check. We should not see anything too extreme.
# todo: try different row/column removal strategies
corrplot(pe.getNumericAlleleMatrix() %>% t %>% cor, method="circle", type="upper")
source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
tree = cache.get("phylotree", function() {
write_tsv(pe.getSimpleSNPFormatForPhyloTree(), "../output/simplesnp.tsv")
# Now run snphylo: http://chibba.pgml.uga.edu/snphylo/
# We assume it is stored in the ignore folder
system("../ignore/SNPhylo/snphylo.sh -s ../output/simplesnp.tsv -P ../output/snphylo")
read.tree("../output/snphylo.ml.tree")
})
clinical = pe.getClinicalInfo()
treePlot = clinical %>% mutate(Ethnicity = clinical$ethnicityCategory, Ethnicities = factor(mixed, labels = c("Single", "Mixed")))
treePlot$patientId = glue("Sample{treePlot$patientId}")
ggtree(tree, color = "grey") %<+% treePlot +
geom_tiplab(aes(color=Ethnicity, label = Ethnicity), show.legend = FALSE) +
geom_tippoint(size = 2, aes(color=Ethnicity, shape = Ethnicities), alpha=1) +
labs(title = "QC: Phylogenetic Tree from alignment with self-reported ethnicity") +
theme(legend.position="right")
source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
library(Rtsne)
library(scatterplot3d)
library(rgl)
clinical = pe.getClinicalInfo()
domRecMatrixNoInvariants = pe.getNumericAlleleMatrix()
tsne <- Rtsne(domRecMatrixNoInvariants, dims = 2, max_iter = 500, perplexity = 3)
df = data.frame(d1 = tsne$Y[,1], d2 = tsne$Y[,2], preeclampsia = clinical$preeclampsia)
ggplot(df, aes(x=d1, y=d2, color=as.factor(preeclampsia))) + geom_point(size = 3) +
scale_color_manual(values = brewer.pal(n = 8, name = "Set1"), labels = c("E.O. Preeclampsia", "Control"), name = "Group") +
labs(title = "No clustering observed in tSNE")
# Let's try in 3D
tsne <- Rtsne(domRecMatrixNoInvariants, dims = 3, max_iter = 500, perplexity = 3)
scatterplot3d(x=tsne$Y[,1],y=tsne$Y[,2],z=tsne$Y[,3], color = c("red","blue")[clinical$preeclampsia + 1])
tsne <- Rtsne(domRecMatrixNoInvariants, dims = 3, max_iter = 500, perplexity = 3)
plot3d(x=tsne$Y[,1],y=tsne$Y[,2],z=tsne$Y[,3], col = c("red","blue")[clinical$preeclampsia + 1], type="s",radius=3)
tsne <- Rtsne(domRecMatrixNoInvariants, dims = 3, max_iter = 500, perplexity = 3)
plot3d(x=tsne$Y[,1],y=tsne$Y[,2],z=tsne$Y[,3], col = c("red","blue", "green", "orange")[as.integer(clinical$ethnicityCategory)], type="s",radius=3)
source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
snpdata <- pe.getSnpData()
head(snpdata)
allele1 = summary(as.factor(snpdata$allele1))
allele2 = summary(as.factor(snpdata$allele2))
res = data.frame(baseCount = allele1, allele = "allele1", base = names(allele1))
res2 = data.frame(baseCount = allele2, allele = "allele2", base = names(allele2))
sums = rbind(res, res2)
ggplot(sums, aes(x = base, fill = allele, y = baseCount)) + geom_bar(position = "dodge", stat = "sum") + labs(title = "Nucleotide Occurrence in SNP dataset") + scale_y_continuous(labels = comma)
sums
bySample1 = snpdata %>% select(sampleId, allele1) %>% group_by(sampleId, allele1) %>% summarise(n = n())
ggplot(bySample1, aes(x = sampleId, y = n, color = allele1)) + geom_point()
ggplot(bySample1, aes(x = allele1, y = n, color = allele1)) + geom_boxplot()
bySample2 = snpdata %>% select(sampleId, allele2) %>% group_by(sampleId, allele2) %>% summarise(n = n())
ggplot(bySample2, aes(x = sampleId, y = n, color = allele2)) + geom_point()
ggplot(bySample2, aes(x = allele2, y = n, color = allele2)) + geom_boxplot()
All of the outliers are with patient 78. Let’s see the avg gen call score by patient:
# There are 8 values with a NaN for gencall score. We can set those to 0.0 because they are all uncalled bases ("--")..
snpdata[is.na(snpdata$avgGenCallScore),]["avgGenCallScore"] = 0
scoresByPatient = snpdata %>% select(sampleId, avgGenCallScore) %>% group_by(sampleId) %>% summarise(average = mean(avgGenCallScore))
summary(scoresByPatient$average)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.7245 0.7798 0.7892 0.7844 0.7903 0.7909
scoresByPatient$preeclampsia = clinical %>% arrange(patientId) %>% select(preeclampsia) %>% unlist %>% as.factor
ggplot(scoresByPatient, aes(y = average, x = sampleId, color = preeclampsia)) + geom_point() + geom_label(aes(label = sampleId))
a1 = snpdata %>% group_by(allele1, sampleId) %>% summarise(count = n()) %>% spread(allele1, count)
a2 = snpdata %>% group_by(allele2, sampleId) %>% summarise(count = n()) %>% spread(allele2, count) %>% select(-sampleId)
colnames(a2) = c("-2", "A2", "C2", "G2", "T2")
aBoth= cbind(a1, a2)
aBoth$preeclampsia = clinical$preeclampsia
aBoth = aBoth %>% select(-sampleId) %>% group_by(preeclampsia) %>% summarise_all(funs(mean))
aBoth
aBoth2 = t(aBoth)
colnames(aBoth2) = c("control", "preeclampsia")
aBoth2 = as.data.frame(aBoth2)
aBoth2 = aBoth2[-1,]
aBoth2$base = unlist(rownames(aBoth2))
aBoth2 = aBoth2 %>% gather('control', 'preeclampsia', key = 'group', value = 'counts')
ggplot(aBoth2, aes(x = base, fill = group, y = counts)) + geom_bar(position = "dodge", stat = "sum") + labs(title = "Nucleotide Occurrence by Case/Control Group") + scale_y_continuous(labels = comma)
simpleSNP = snpdata
simpleSNP$avgGenCallScore = NULL
simpleSNP = simpleSNP %>% unite(alleles, c(allele1, allele2), sep = "")
colnames(simpleSNP) = c("location", "sample", "alleles")
simpleSNP = simpleSNP %>% spread(sample, alleles, sep = "")
head(simpleSNP)
countOfVariantsPerLocation <- apply(simpleSNP[,-1], 1, function(x)length(unique(x)))
histogram(countOfVariantsPerLocation, col=brewer.pal(n = 3, name = "Set1")[2])
data.frame(countOfVariantsPerLocation = countOfVariantsPerLocation) %>% group_by(countOfVariantsPerLocation) %>% summarise(n=n())
duplicates = pe.getDuplicateMeasurementLocations()
duplicates
fullSet = pe.getSnpsWithChipInfo()
head(fullSet)
dupeMeasurements = fullSet %>% select(sampleId, allele1, allele2, Chr, MapInfo) %>% mutate(id = paste(Chr, MapInfo)) %>% filter(id %in% duplicates$id)
dupeMeasurements[1:5,]
There is variation at these SNPs. We’ll need to think about how to handle
phyloSnp = pe.getSimpleSNPFormatForPhyloTree()
chrData = phyloSnp %>% group_by(`#Chrom`) %>% summarise(loci = n()) %>% mutate(Chr = 1:25) %>% arrange(Chr)
ggplot(chrData, aes(x = Chr, y = loci, label = `#Chrom`)) + geom_col() + geom_label()
chrData
We confirm there are no Y chromosome measurements made.
alleleCounts = snpdata %>% select(illuminaSnpLocation, allele1) %>% group_by(illuminaSnpLocation) %>% summarise(numberOfAlleles = length(unique(allele1)))
alleleCounts$snpIndex = seq.int(nrow(alleleCounts))
ggplot(alleleCounts, aes(x = snpIndex, y = numberOfAlleles)) + geom_hex( bins = 30) + scale_y_continuous(breaks = c(1,2,3)) + labs(title = "Density of SNP variation for Allele 1")
alleleCounts = snpdata %>% select(illuminaSnpLocation, allele2) %>% group_by(illuminaSnpLocation) %>% summarise(numberOfAlleles = length(unique(allele2)))
alleleCounts$snpIndex = seq.int(nrow(alleleCounts))
ggplot(alleleCounts, aes(x = snpIndex, y = numberOfAlleles)) + geom_hex( bins = 30) + scale_y_continuous(breaks = c(1,2,3)) + labs(title = "Density of SNP variation for Allele 2")
alleles1 = snpdata %>% mutate(allele = allele1) %>% select(illuminaSnpLocation, allele) %>%
group_by(illuminaSnpLocation) %>% mutate(allele = paste(unique(allele), collapse = "")) %>%
group_by(allele) %>% summarise(count = n()) %>% mutate(len = str_length(allele), count = count/60) %>%
arrange(desc(len)) %>% select(count, allele)
alleles2 = snpdata %>% mutate(allele = allele2) %>% select(illuminaSnpLocation, allele) %>%
group_by(illuminaSnpLocation) %>% mutate(allele = paste(unique(allele), collapse = "")) %>%
group_by(allele) %>% summarise(count = n()) %>% mutate(len = str_length(allele), count = count/60) %>%
arrange(desc(len)) %>% select(count, allele)
both = cbind(alleles1, alleles2[-2])
colnames(both) = c("allele1count", "nucleotides", "allele2count")
both
source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
The data comes from the following:
Illumina
Descriptor File Name: Cardio-Metabo_Chip_11395247_A.bpm
Assay Format: Infinium HD Ultra
Date Manufactured: 9/8/2009
Loci Count: 196725
# Let's look at the raw illumina data about this metabochip
file.head(pe.illuminaPath, 10)
Illumina, Inc.,,,,,,,,,,,,,,,,,
[Heading],,,,,,,,,,,,,,,,,,
Descriptor File Name,Cardio-Metabo_Chip_11395247_A.bpm,,,,,,,,,,,,,,,,,
Assay Format,Infinium HD Ultra,,,,,,,,,,,,,,,,,
Date Manufactured,9/8/2009,,,,,,,,,,,,,,,,,
Loci Count ,196725,,,,,,,,,,,,,,,,,
[Assay],,,,,,,,,,,,,,,,,,
IlmnID,Name,IlmnStrand,SNP,AddressA_ID,AlleleA_ProbeSeq,AddressB_ID,Allele
chr1:109457160-0_B_R_1602772855,chr1:109457160,BOT,[T/G],41769450,AGCCTTTT
chr1:109457233-0_B_F_1602777793,chr1:109457233,BOT,[T/G],11615356,CCTTCTCC
metaboChip = pe.getMetaboChip()
head(metaboChip)
skim_tee(metaboChip)
Skim summary statistics
n obs: 196725
n variables: 19
── Variable type:character ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
variable missing complete n min max empty n_unique
AlleleA_ProbeSeq 0 196725 196725 50 50 0 196701
AlleleB_ProbeSeq 173712 23013 196725 50 50 0 23009
IlmnID 0 196725 196725 23 36 0 196725
IlmnStrand 0 196725 196725 3 3 0 2
Name 0 196725 196725 5 17 0 196725
Ploidy 0 196725 196725 7 7 0 1
SNP 0 196725 196725 5 5 0 10
Source 0 196725 196725 7 23 0 7
SourceSeq 0 196725 196725 59 125 0 196711
SourceStrand 0 196725 196725 3 3 0 2
Species 0 196725 196725 12 12 0 1
TopGenomicSeq 0 196725 196725 59 125 0 196705
── Variable type:factor ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
variable missing complete n n_unique
Chr 0 196725 196725 26
top_counts ordered
6: 21966, 1: 18720, 11: 16108, 2: 15750 FALSE
── Variable type:integer ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
variable missing complete n mean sd p0
AddressA_ID 0 196725 196725 4e+07 1.8e+07 1.1e+07
AddressB_ID 173712 23013 196725 4.2e+07 1.9e+07 1.1e+07
BeadSetID 0 196725 196725 224.28 0.96 223
GenomeBuild 0 196725 196725 35.88 1.44 18
MapInfo 0 196725 196725 7.3e+07 5.5e+07 923
SourceVersion 0 196725 196725 52.49 71.59 0
p25 p50 p75 p100 hist
2.5e+07 3.9e+07 5.5e+07 7.5e+07 ▇▇▇▇▇▆▅▆
2.6e+07 4.2e+07 5.8e+07 7.5e+07 ▇▇▇▇▇▇▇▇
223 225 225 225 ▅▁▁▁▁▁▁▇
36 36 36 92 ▁▇▁▁▁▁▁▁
2.8e+07 6e+07 1.1e+08 2.5e+08 ▇▆▅▅▂▂▁▁
0 0 128 978 ▇▅▁▁▁▁▁▁
source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
library(xgboost)
library(caTools)
conflict_prefer("union", "dplyr")
[conflicted] Will prefer dplyr::union over any other package
conflict_prefer("colnames", "base")
[conflicted] Will prefer base::colnames over any other package
conflict_prefer("as.numeric", "base")
[conflicted] Will prefer base::as.numeric over any other package
clinical = pe.getClinicalInfo()
domRecMatrix = pe.getNumericAlleleMatrix()
domRecMatrix[1:5,1:5]
chr1_156843624 chr1_156843733 chr1_156850715 chr1_156853992
[1,] 1 -1 1 1
[2,] 1 1 0 1
[3,] 1 1 0 1
[4,] 1 1 0 1
[5,] 1 1 1 1
chr1_156854468
[1,] 1
[2,] 1
[3,] 1
[4,] 1
[5,] 1
draw_confusion_matrix <- function(cm) {
total = sum(cm$table)
res <- as.numeric(cm$table)
# Generate color gradients. Palettes come from RColorBrewer.
greenPalette = c("#F7FCF5","#E5F5E0","#C7E9C0","#A1D99B","#74C476","#41AB5D","#238B45","#006D2C","#00441B")
redPalette = c("#FFF5F0","#FEE0D2","#FCBBA1","#FC9272","#FB6A4A","#EF3B2C","#CB181D","#A50F15","#67000D")
getColor = function (greenOrRed = "green", amount = 0) {
if (amount == 0)
return("#FFFFFF")
palette = greenPalette
if (greenOrRed == "red")
palette = redPalette
colorRampPalette(palette)(100)[10 + ceiling(90 * amount / total)]
}
# set the basic layout
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('CONFUSION MATRIX', cex.main=2)
# create the matrix
classes = colnames(cm$table)
rect(150, 430, 240, 370, col=getColor("green", res[1]))
text(195, 435, classes[1], cex=1.2)
rect(250, 430, 340, 370, col=getColor("red", res[3]))
text(295, 435, classes[2], cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col=getColor("red", res[2]))
rect(250, 305, 340, 365, col=getColor("green", res[4]))
text(140, 400, classes[1], cex=1.2, srt=90)
text(140, 335, classes[2], cex=1.2, srt=90)
# add in the cm results
text(195, 400, res[1], cex=1.6, font=2, col='white')
text(195, 335, res[2], cex=1.6, font=2, col='white')
text(295, 400, res[3], cex=1.6, font=2, col='white')
text(295, 335, res[4], cex=1.6, font=2, col='white')
# add in the specifics
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
# add in the accuracy information
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}
combined = cbind(clinical %>% select(preeclampsia), domRecMatrix)
combined$preeclampsia = as.factor(combined$preeclampsia)
split <- sample.split(clinical$preeclampsia, SplitRatio = 0.8)
training_set = subset(combined, split == TRUE)
validation_set = subset(combined, split == FALSE)
dTrain <- xgb.DMatrix(as.matrix(training_set[-1]), label = as.matrix(training_set[1]))
dValidation <- xgb.DMatrix(as.matrix(validation_set[-1]), label = as.matrix(validation_set[1]))
dwatchlist <- list(eval = dValidation, train = dTrain)
params <- list(eta = 0.1, max_depth = 2, objective = "binary:logistic")
classifier <- xgb.train(data = dTrain, params = params, save_period = 100, watchlist = dwatchlist,
label = training_set[1], early_stopping_rounds = 5, nrounds = 50)
[1] eval-error:0.500000 train-error:0.020833
Multiple eval metrics are present. Will use train_error for early stopping.
Will train until train_error hasn't improved in 5 rounds.
[2] eval-error:0.666667 train-error:0.020833
[3] eval-error:0.416667 train-error:0.020833
[4] eval-error:0.666667 train-error:0.000000
[5] eval-error:0.666667 train-error:0.000000
[6] eval-error:0.583333 train-error:0.000000
[7] eval-error:0.666667 train-error:0.000000
[8] eval-error:0.500000 train-error:0.000000
[9] eval-error:0.583333 train-error:0.000000
Stopping. Best iteration:
[4] eval-error:0.666667 train-error:0.000000
y_pred <- predict(classifier, newdata = as.matrix(validation_set[-1])) %>% round
actual = as.data.frame(validation_set)[, 1]
importance = xgb.importance(colnames(combined)[-1], model = classifier)
ggplot(importance[1:10,], aes(x = Feature, y = Gain)) + geom_col()
cm = caret::confusionMatrix(factor(y_pred), factor(actual))
draw_confusion_matrix(cm)
print(importance[1:10,])
Feature Gain Cover Frequency
1: rs2652492 0.34887673 0.33553125 0.22222222
2: rs4736440 0.14551246 0.16446875 0.11111111
3: rs6034593 0.09757756 0.08659054 0.11111111
4: chr1:119340228 0.07513690 0.06663553 0.07407407
5: rs12193094 0.05426129 0.02975212 0.03703704
6: chr13:109780171 0.04842820 0.02975212 0.03703704
7: chr1:119258927 0.03764861 0.02956836 0.03703704
8: rs2039465 0.03372908 0.03306453 0.03703704
9: chr8:19788876 0.03069535 0.02866063 0.03703704
10: rs13078384 0.02808678 0.02793489 0.03703704
cm
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1 3
1 5 3
Accuracy : 0.3333
95% CI : (0.0992, 0.6511)
No Information Rate : 0.5
P-Value [Acc > NIR] : 0.9270
Kappa : -0.3333
Mcnemar's Test P-Value : 0.7237
Sensitivity : 0.16667
Specificity : 0.50000
Pos Pred Value : 0.25000
Neg Pred Value : 0.37500
Prevalence : 0.50000
Detection Rate : 0.08333
Detection Prevalence : 0.33333
Balanced Accuracy : 0.33333
'Positive' Class : 0
clinical = pe.getClinicalInfo()
domRecMatrix = pe.getNumericAlleleMatrix()
params <- list(eta = 0.02, max_depth = 2, objective = "binary:logistic")
booster = xgb.cv(params, domRecMatrix, 30, nfold = 5, label = clinical$preeclampsia)
[1] train-error:0.062507+0.022822 test-error:0.568532+0.105236
[2] train-error:0.041582+0.034912 test-error:0.553147+0.102857
[3] train-error:0.045837+0.030617 test-error:0.523660+0.136221
[4] train-error:0.041582+0.034912 test-error:0.522378+0.123573
[5] train-error:0.037415+0.035883 test-error:0.539044+0.113392
[6] train-error:0.033248+0.038668 test-error:0.522378+0.123573
[7] train-error:0.033248+0.038668 test-error:0.555711+0.112550
[8] train-error:0.033248+0.038668 test-error:0.522378+0.123573
[9] train-error:0.033248+0.038668 test-error:0.555711+0.112550
[10] train-error:0.020748+0.018635 test-error:0.539044+0.113392
[11] train-error:0.016582+0.015569 test-error:0.539044+0.113392
[12] train-error:0.016582+0.015569 test-error:0.539044+0.113392
[13] train-error:0.012415+0.016625 test-error:0.539044+0.113392
[14] train-error:0.012415+0.016625 test-error:0.539044+0.113392
[15] train-error:0.012415+0.016625 test-error:0.539044+0.113392
[16] train-error:0.008248+0.010103 test-error:0.555711+0.112550
[17] train-error:0.008248+0.010103 test-error:0.572378+0.121241
[18] train-error:0.008248+0.010103 test-error:0.572378+0.121241
[19] train-error:0.008248+0.010103 test-error:0.572378+0.121241
[20] train-error:0.004082+0.008163 test-error:0.589044+0.115749
[21] train-error:0.004082+0.008163 test-error:0.572378+0.121241
[22] train-error:0.004082+0.008163 test-error:0.555711+0.112550
[23] train-error:0.004082+0.008163 test-error:0.572377+0.109186
[24] train-error:0.004082+0.008163 test-error:0.572378+0.121241
[25] train-error:0.004082+0.008163 test-error:0.589044+0.115749
[26] train-error:0.004082+0.008163 test-error:0.572378+0.121241
[27] train-error:0.004082+0.008163 test-error:0.589044+0.115749
[28] train-error:0.004082+0.008163 test-error:0.572378+0.121241
[29] train-error:0.004082+0.008163 test-error:0.589044+0.115749
[30] train-error:0.004082+0.008163 test-error:0.589044+0.115749
booster$evaluation_log[booster$best_iteration]$test_error_mean
NULL
clinicalNumeric = as.matrix(pe.getClinicalAsNumerics())
training_set = subset(clinicalNumeric, split == TRUE)
validation_set = subset(clinicalNumeric, split == FALSE)
dTrain <- xgb.DMatrix(training_set[,-1], label = training_set[,1])
params <- list(eta = 0.3, max_depth = 1, objective = "binary:logistic")
classifier <- xgb.train(params, dTrain, 20)
y_pred <- predict(classifier, newdata = validation_set[,-1]) %>% round
actual = as.data.frame(validation_set)[, 1]
importance = xgb.importance(colnames(clinicalNumeric)[-1], model = classifier)
ggplot(importance[1:10,], aes(x = Feature, y = Gain)) + geom_col()
Warning: Removed 9 rows containing missing values (position_stack).
cm = caret::confusionMatrix(factor(y_pred), factor(actual))
draw_confusion_matrix(cm)
print(importance[1:10,])
Feature Gain Cover Frequency
1: gestAge 1 1 1
2: <NA> NA NA NA
3: <NA> NA NA NA
4: <NA> NA NA NA
5: <NA> NA NA NA
6: <NA> NA NA NA
7: <NA> NA NA NA
8: <NA> NA NA NA
9: <NA> NA NA NA
10: <NA> NA NA NA
cm
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 6 0
1 0 6
Accuracy : 1
95% CI : (0.7354, 1)
No Information Rate : 0.5
P-Value [Acc > NIR] : 0.0002441
Kappa : 1
Mcnemar's Test P-Value : NA
Sensitivity : 1.0
Specificity : 1.0
Pos Pred Value : 1.0
Neg Pred Value : 1.0
Prevalence : 0.5
Detection Rate : 0.5
Detection Prevalence : 0.5
Balanced Accuracy : 1.0
'Positive' Class : 0
# rmarkdown::render('main.rmd')