We can also try a 3D PCA:

plot3d(x=pca$x[,1],y=pca$x[,2],z=pca$x[,3], col = c("red","blue")[clinical$preeclampsia + 1], type="s",radius=3)

Nothing jumps out. Let’s double check ethnicities, as we should see clusters from SNPS there.

clinical = pe.getClinicalInfo()

plotTemplate = function(pca, plotTitle){
  df <- cbind(clinical, as.data.frame(pca$x))
  importance = summary(pca)$importance
  label1 = importance[2] * 100
  label2 = importance[5] * 100

  ggplot(df, aes(x=PC1, y=PC2, color=clinical$ethnicityCategory, shape = as.factor(mixed))) +
  geom_point(size = 3) +
  scale_color_manual(values = brewer.pal(n = 8, name = "Set1"), labels = levels(clinical$ethnicityCategory), name = "Group") +
  labs(title = plotTitle, x = glue("PC1 - {round(label1, 1)}%"), y = glue("PC2 - {round(label2, 1)}%")) +
  scale_shape_manual(name = "Ethnicities", labels = c("Single", "Mixed"), values = c(16,17) )
}

pca = prcomp(pe.getNumericAlleleMatrix())
plotTemplate(pca, "QC: PCA of SNPs shows clustering by reported ethnicity, as expected")

Let’s do a 3D plot of ethnicity

plot3d(x=pca$x[,1],y=pca$x[,2],z=pca$x[,3], col = brewer.pal(n = 4, name = "Set1")[clinical$ethnicityCategory], type="s",radius=3)
clinical = pe.getClinicalInfo()

plotTemplateDiseaseEth = function(pca, plotTitle, clinicalData = pe.getClinicalInfo()) {
  df <- cbind(clinicalData, as.data.frame(pca$x))
  importance = summary(pca)$importance
  label1 = importance[2] * 100
  label2 = importance[5] * 100
  ggplot(df, aes(x=PC1, y=PC2, color=as.factor(preeclampsia),  shape = as.factor(mixed))) + geom_point(size = 3) +
  scale_color_manual(values = brewer.pal(n = 8, name = "Set1"), labels = c("E.O. Preeclampsia", "Control"), name = "Group") +
  scale_shape_manual(name = "Ethnicities", labels = c("Single", "Mixed"), values = c(16,17) ) +
#  geom_text(aes(label=ifelse(PC1 > 50 & (PC2 > 50 | PC2 < -50), patientId, "")),hjust=-.5, vjust=-.5, size = 2.5, color = "black") +
  labs(title = plotTitle, x = glue("PC1 - {round(label1, 1)}%"), y = glue("PC2 - {round(label2, 1)}%"))
}

domRecMatrix = pe.getNumericAlleleMatrix()
ress = lapply(levels(clinical$ethnicityCategory), function (name) {
  groupIds = clinical$ethnicityCategory == name
  pca = prcomp(domRecMatrix[groupIds,])
  plotTemplateDiseaseEth(pca, glue("PCA of SNPs for {name} Ethnicity shows no clustering by disease"), clinical[groupIds,])
})
ress
[[1]]


[[2]]


[[3]]


[[4]]

source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
qqPlot(pe.getPValsAll()$minPval, main = "QQPlot of p-values is consistent with uniform null distribution")

source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package

Look at the top SNPs before correction

res = lapply(c("dominant", "codominant", "recessive", "overdominant", "log-additive"), function (modelName) {
  cache.get(modelName, function () stop("Use thread.r to compute pvals.")) %>% as.data.frame %>% rownames_to_column(var = "snp") %>% arrange_at(3) %>% dplyr::slice(1:10) %>% select(-one_of("comments"))
})

res
[[1]]
          snp     dominant
1   rs7804641 2.271119e-05
2     rs30117 4.810937e-05
3  rs10482796 1.335995e-04
4   rs6604616 1.366858e-04
5  rs11774014 2.088724e-04
6    rs209549 2.138686e-04
7   rs4512191 2.138686e-04
8   rs1946016 2.169683e-04
9   rs1728232 2.341614e-04
10  rs4822422 2.341614e-04

[[2]]
             snp   codominant
1  chr5.32898472 2.831337e-05
2  chr5.32888767 3.405689e-05
3  chr5.32897461 4.118232e-05
4  chr5.32901811 4.118232e-05
5  chr5.32889877 9.170883e-05
6  chr5.32903017 9.170883e-05
7       rs209549 1.028533e-04
8      rs7804641 1.069225e-04
9       rs516451 1.222002e-04
10     rs6604616 1.366858e-04

[[3]]
             snp    recessive
1  chr5.32898472 4.742674e-06
2  chr5.32888767 1.148135e-05
3  chr5.32897461 1.719846e-05
4  chr5.32901811 1.719846e-05
5  chr5.32889877 4.953733e-05
6  chr5.32903017 4.953733e-05
7      rs1147227 4.953733e-05
8       rs516451 4.953733e-05
9  chr5.32884755 1.366858e-04
10 chr5.32890721 1.366858e-04

[[4]]
              snp overdominant
1       rs7804641 6.020586e-05
2  chr11.48008018 9.513349e-05
3        rs817959 1.335995e-04
4      rs17773842 1.366858e-04
5       rs6604616 1.366858e-04
6      rs12518851 1.389169e-04
7       rs6061194 1.389896e-04
8       rs3104996 1.535729e-04
9      rs12502984 1.570357e-04
10  chr7.14684243 1.798556e-04

[[5]]
             snp log-additive
1       rs209549 2.167011e-05
2  chr5.32897461 2.712220e-05
3  chr5.32901811 2.712220e-05
4  chr5.32888767 2.809238e-05
5        rs30117 3.956550e-05
6  chr5.32889877 4.689104e-05
7  chr5.32903017 4.689104e-05
8       rs516451 8.178124e-05
9     rs10482796 8.805029e-05
10     rs7804641 1.069225e-04

Look at the top SNPs after correction

res = lapply(c("dominant", "codominant", "recessive", "overdominant", "log-additive"), function (modelName) {
  Bonferroni.sig(cache.get(modelName, function () stop("Use thread.r to compute pvals.")), model = modelName, alpha = 0.05, include.all.SNPs=FALSE)
})
number of tests:  132667 
alpha: 0.05 
corrected alpha: 3.768835e-07 
   No significant SNPs after Bonferroni correction 
number of tests:  132709 
alpha: 0.05 
corrected alpha: 3.767642e-07 
   No significant SNPs after Bonferroni correction 
number of tests:  132692 
alpha: 0.05 
corrected alpha: 3.768125e-07 
   No significant SNPs after Bonferroni correction 
number of tests:  132669 
alpha: 0.05 
corrected alpha: 3.768778e-07 
   No significant SNPs after Bonferroni correction 
number of tests:  132697 
alpha: 0.05 
corrected alpha: 3.767983e-07 
   No significant SNPs after Bonferroni correction 
source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package

Let’s read in the clinical data

clinicalRaw = read_csv(pe.clinicalDataPath, col_types = cols())
nrow(clinicalRaw)
[1] 109
head(clinicalRaw)

Filter Clinical

49 of the mothers were excluded due to lack of adequate maternal sample. Exclude these from clinical

clinical = pe.getClinicalInfo()
skim_tee(clinical)
Skim summary statistics
 n obs: 60 
 n variables: 21 

── Variable type:character ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   variable missing complete  n min max empty n_unique
 ethnicity1       0       60 60   6  12     0       11
 ethnicity2      37       23 60   5  15     0       10
 ethnicity3      44       16 60   7  19     0        8

── Variable type:factor ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
          variable missing complete  n n_unique
 ethnicityCategory       0       60 60        4
                       top_counts ordered
 Fil: 27, Cau: 19, Asi: 7, Pac: 7   FALSE

── Variable type:integer ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
             variable missing complete  n   mean    sd p0    p25   p50
 3HrGlucose1HrGlucola      24       36 60 126.06 28.61 85 103    121.5
            abruption       0       60 60   0.13  0.34  0   0      0  
             daughter       0       60 60   0.52  0.5   0   0      1  
                  gdm       0       60 60   0.1   0.3   0   0      0  
                 iugr       0       60 60   0.15  0.36  0   0      0  
                mixed       0       60 60   0.38  0.49  0   0      0  
               momAge       0       60 60  29.6   5.71 18  26     31  
            patientId       0       60 60  57.05 31.08  2  32.75  59.5
         preeclampsia       0       60 60   0.52  0.5   0   0      1  
         primaGravida       0       60 60   0.35  0.48  0   0      0  
    p75 p100     hist
 145.75  195 ▇▅▇▅▃▂▃▁
   0       1 ▇▁▁▁▁▁▁▁
   1       1 ▇▁▁▁▁▁▁▇
   0       1 ▇▁▁▁▁▁▁▁
   0       1 ▇▁▁▁▁▁▁▂
   1       1 ▇▁▁▁▁▁▁▅
  34      42 ▅▂▃▆▇▇▁▂
  82.25  102 ▅▃▃▅▃▅▅▇
   1       1 ▇▁▁▁▁▁▁▇
   1       1 ▇▁▁▁▁▁▁▅

── Variable type:numeric ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
          variable missing complete  n    mean      sd    p0     p25
       babyWgtGram       0       60 60 2398.94 1013.13 730   1552.5 
               bmi       5       55 60   25.45    6.41  18.6   21.2 
 ethnicityPercent1       9       51 60   78.71   31.9    1     50   
 ethnicityPercent2      51        9 60   31.11   18.96   5     12.5 
 ethnicityPercent3      55        5 60   29.5    13.39  12.5   25   
           gestAge       0       60 60   35.66    4.04  27     32.88
   placentaWgtGram      33       27 60  351.43  105.06 165    266.5 
     p50     p75   p100     hist
 2386    3345.75 4193   ▅▆▇▃▂▅▇▃
   23.2    26.95   48.5 ▇▅▂▂▁▁▁▁
  100     100     100   ▁▁▁▃▁▁▁▇
   25      50      50   ▂▃▁▃▁▁▁▇
   25      37.5    47.5 ▃▁▇▁▁▃▁▃
   34.75   39.4    41.2 ▂▂▂▆▃▁▇▆
  366     424     533   ▃▃▂▃▇▂▃▃

Now let’s create a numerical version of the clinical data for PCA and other methods

clinicalNumeric = pe.getClinicalAsNumerics()
clinicalNumeric[1:5,]

Let’s see any correlations in the clinical data

In the plot below, we see only expected correlations.

  • Preeclampsia negatively correlates with gestAge and babyWgtGram
  • If it’s your first baby (primaGravida) you are likely younger (momAge)
  • If your baby has IUGR it weighs less (babyWgtGram)
  • IUGR and preeclampsia are positively correlated
  • A longer gestAge correlates with a heavier birthweight (babyWgtGram)
corrplot(cor(clinicalNumeric), method="circle", type="upper")

We see correlation between ethnicities, as we would expect.

Let’s check correlation of patients just as a sanity check. We should not see anything too extreme.

# todo: try different row/column removal strategies
corrplot(pe.getNumericAlleleMatrix() %>% t %>% cor, method="circle", type="upper")

source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package

Let’s generate a Phylogenetic Tree

tree = cache.get("phylotree", function() {
  write_tsv(pe.getSimpleSNPFormatForPhyloTree(), "../output/simplesnp.tsv")
  
  # Now run snphylo: http://chibba.pgml.uga.edu/snphylo/
  # We assume it is stored in the ignore folder
  system("../ignore/SNPhylo/snphylo.sh -s ../output/simplesnp.tsv -P ../output/snphylo")
  read.tree("../output/snphylo.ml.tree")
})
clinical = pe.getClinicalInfo()
treePlot = clinical %>% mutate(Ethnicity = clinical$ethnicityCategory, Ethnicities = factor(mixed, labels = c("Single", "Mixed")))
treePlot$patientId = glue("Sample{treePlot$patientId}")
ggtree(tree, color = "grey") %<+% treePlot +
  geom_tiplab(aes(color=Ethnicity, label = Ethnicity), show.legend = FALSE) +
  geom_tippoint(size = 2, aes(color=Ethnicity, shape = Ethnicities), alpha=1) +
  labs(title = "QC: Phylogenetic Tree from alignment with self-reported ethnicity") +
  theme(legend.position="right")

source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
library(Rtsne)
library(scatterplot3d)
library(rgl)
clinical = pe.getClinicalInfo()
domRecMatrixNoInvariants = pe.getNumericAlleleMatrix()
tsne <- Rtsne(domRecMatrixNoInvariants, dims = 2, max_iter = 500, perplexity = 3)
df = data.frame(d1 = tsne$Y[,1], d2 = tsne$Y[,2], preeclampsia = clinical$preeclampsia)
ggplot(df, aes(x=d1, y=d2, color=as.factor(preeclampsia))) + geom_point(size = 3) +
  scale_color_manual(values = brewer.pal(n = 8, name = "Set1"), labels = c("E.O. Preeclampsia", "Control"), name = "Group") +
  labs(title = "No clustering observed in tSNE")

# Let's try in 3D
tsne <- Rtsne(domRecMatrixNoInvariants, dims = 3, max_iter = 500, perplexity = 3)
scatterplot3d(x=tsne$Y[,1],y=tsne$Y[,2],z=tsne$Y[,3], color = c("red","blue")[clinical$preeclampsia + 1])

Let’s try interactive CSV

tsne <- Rtsne(domRecMatrixNoInvariants, dims = 3, max_iter = 500, perplexity = 3)
plot3d(x=tsne$Y[,1],y=tsne$Y[,2],z=tsne$Y[,3], col = c("red","blue")[clinical$preeclampsia + 1], type="s",radius=3)

Let’s look at Ethnicity

tsne <- Rtsne(domRecMatrixNoInvariants, dims = 3, max_iter = 500, perplexity = 3)
plot3d(x=tsne$Y[,1],y=tsne$Y[,2],z=tsne$Y[,3], col = c("red","blue", "green", "orange")[as.integer(clinical$ethnicityCategory)], type="s",radius=3)
source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
snpdata <- pe.getSnpData()
head(snpdata)

Let’s look at the entire SNP data and see what we find

allele1 = summary(as.factor(snpdata$allele1))
allele2 = summary(as.factor(snpdata$allele2))
res = data.frame(baseCount = allele1, allele = "allele1", base = names(allele1))
res2 = data.frame(baseCount = allele2, allele = "allele2", base = names(allele2))
sums = rbind(res, res2)
ggplot(sums, aes(x = base, fill = allele, y = baseCount)) + geom_bar(position = "dodge", stat = "sum") + labs(title = "Nucleotide Occurrence in SNP dataset") + scale_y_continuous(labels = comma)

sums

Is there any difference in base pair frequency amongst the samples?

bySample1 = snpdata %>% select(sampleId, allele1) %>% group_by(sampleId, allele1) %>% summarise(n = n())
ggplot(bySample1, aes(x = sampleId, y = n, color = allele1)) + geom_point()

ggplot(bySample1, aes(x = allele1, y = n, color = allele1)) + geom_boxplot()

bySample2 = snpdata %>% select(sampleId, allele2) %>% group_by(sampleId, allele2) %>% summarise(n = n())
ggplot(bySample2, aes(x = sampleId, y = n, color = allele2)) + geom_point()

ggplot(bySample2, aes(x = allele2, y = n, color = allele2)) + geom_boxplot()

What are those outliers? Let’s look at the call rates

All of the outliers are with patient 78. Let’s see the avg gen call score by patient:

# There are 8 values with a NaN for gencall score. We can set those to 0.0 because they are all uncalled bases ("--")..
snpdata[is.na(snpdata$avgGenCallScore),]["avgGenCallScore"] = 0
scoresByPatient = snpdata %>% select(sampleId, avgGenCallScore) %>% group_by(sampleId) %>% summarise(average = mean(avgGenCallScore))
summary(scoresByPatient$average)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.7245  0.7798  0.7892  0.7844  0.7903  0.7909 
scoresByPatient$preeclampsia = clinical %>% arrange(patientId) %>% select(preeclampsia) %>% unlist %>% as.factor
ggplot(scoresByPatient, aes(y = average, x = sampleId, color = preeclampsia)) + geom_point() + geom_label(aes(label = sampleId))

Takeaways

  • Patient 78 has low quality reads.

Let’s make sure there are no noticeable difference in calls between the case and control groups

a1 = snpdata %>% group_by(allele1, sampleId) %>% summarise(count = n()) %>% spread(allele1, count)
a2 = snpdata %>% group_by(allele2, sampleId) %>% summarise(count = n()) %>% spread(allele2, count) %>% select(-sampleId)
colnames(a2) = c("-2", "A2", "C2", "G2", "T2")
aBoth= cbind(a1, a2)
aBoth$preeclampsia = clinical$preeclampsia
aBoth = aBoth %>% select(-sampleId) %>% group_by(preeclampsia) %>% summarise_all(funs(mean))
aBoth
aBoth2 = t(aBoth)
colnames(aBoth2) = c("control", "preeclampsia")
aBoth2 = as.data.frame(aBoth2)
aBoth2 = aBoth2[-1,]
aBoth2$base = unlist(rownames(aBoth2))
aBoth2 = aBoth2 %>% gather('control', 'preeclampsia', key = 'group', value = 'counts')
ggplot(aBoth2, aes(x = base, fill = group, y = counts)) + geom_bar(position = "dodge", stat = "sum") + labs(title = "Nucleotide Occurrence by Case/Control Group") + scale_y_continuous(labels = comma)

Takeaway: No difference found.

Let’s look at the variance in SNPS from both alleles combined

simpleSNP = snpdata
simpleSNP$avgGenCallScore = NULL
simpleSNP = simpleSNP %>% unite(alleles, c(allele1, allele2), sep = "")
colnames(simpleSNP) = c("location", "sample", "alleles")
simpleSNP = simpleSNP %>% spread(sample, alleles, sep = "")
head(simpleSNP)
countOfVariantsPerLocation <- apply(simpleSNP[,-1], 1, function(x)length(unique(x)))
histogram(countOfVariantsPerLocation, col=brewer.pal(n = 3, name = "Set1")[2])

data.frame(countOfVariantsPerLocation = countOfVariantsPerLocation) %>% group_by(countOfVariantsPerLocation) %>% summarise(n=n())

So there is no variation in 60,000 of these locations.

In 16 loci Illumina has 2 measurements

duplicates = pe.getDuplicateMeasurementLocations()
duplicates

Let’s see what’s happening at these spots and if it is safe to drop one set

fullSet = pe.getSnpsWithChipInfo()
head(fullSet)
dupeMeasurements = fullSet %>% select(sampleId, allele1, allele2, Chr, MapInfo) %>% mutate(id = paste(Chr, MapInfo)) %>% filter(id %in% duplicates$id)
dupeMeasurements[1:5,]

Takeaway

There is variation at these SNPs. We’ll need to think about how to handle

phyloSnp = pe.getSimpleSNPFormatForPhyloTree()
chrData = phyloSnp %>% group_by(`#Chrom`) %>% summarise(loci = n()) %>% mutate(Chr = 1:25) %>% arrange(Chr)
ggplot(chrData, aes(x = Chr, y = loci, label  = `#Chrom`)) + geom_col() + geom_label()

chrData

Takeaway

We confirm there are no Y chromosome measurements made.

Let’s see how often the variations vary for each allele

alleleCounts = snpdata %>% select(illuminaSnpLocation, allele1) %>% group_by(illuminaSnpLocation) %>% summarise(numberOfAlleles = length(unique(allele1)))
alleleCounts$snpIndex = seq.int(nrow(alleleCounts))
ggplot(alleleCounts, aes(x = snpIndex, y = numberOfAlleles)) + geom_hex( bins = 30) + scale_y_continuous(breaks = c(1,2,3)) + labs(title = "Density of SNP variation for Allele 1")

alleleCounts = snpdata %>% select(illuminaSnpLocation, allele2) %>% group_by(illuminaSnpLocation) %>% summarise(numberOfAlleles = length(unique(allele2)))
alleleCounts$snpIndex = seq.int(nrow(alleleCounts))
ggplot(alleleCounts, aes(x = snpIndex, y = numberOfAlleles)) + geom_hex( bins = 30) + scale_y_continuous(breaks = c(1,2,3)) + labs(title = "Density of SNP variation for Allele 2")

Let’s see why we have more than 2 nucleotides found in a base pair

alleles1 = snpdata %>% mutate(allele = allele1) %>% select(illuminaSnpLocation, allele) %>%
  group_by(illuminaSnpLocation) %>% mutate(allele = paste(unique(allele), collapse = "")) %>%
  group_by(allele) %>% summarise(count = n()) %>% mutate(len = str_length(allele), count = count/60) %>%
  arrange(desc(len)) %>% select(count, allele)


alleles2 = snpdata %>% mutate(allele = allele2) %>% select(illuminaSnpLocation, allele) %>%
  group_by(illuminaSnpLocation) %>% mutate(allele = paste(unique(allele), collapse = "")) %>%
  group_by(allele) %>% summarise(count = n()) %>% mutate(len = str_length(allele), count = count/60) %>%
  arrange(desc(len)) %>% select(count, allele)

both = cbind(alleles1, alleles2[-2])
colnames(both) = c("allele1count", "nucleotides", "allele2count")
both

Takeaway: 3rd read is just noncalls.

source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package

Some notes about this Illumina Dataset

The data comes from the following:

Illumina
Descriptor File Name: Cardio-Metabo_Chip_11395247_A.bpm
Assay Format: Infinium HD Ultra
Date Manufactured: 9/8/2009
Loci Count: 196725
# Let's look at the raw illumina data about this metabochip
file.head(pe.illuminaPath, 10)
Illumina, Inc.,,,,,,,,,,,,,,,,,
[Heading],,,,,,,,,,,,,,,,,,
Descriptor File Name,Cardio-Metabo_Chip_11395247_A.bpm,,,,,,,,,,,,,,,,,
Assay Format,Infinium HD Ultra,,,,,,,,,,,,,,,,,
Date Manufactured,9/8/2009,,,,,,,,,,,,,,,,,
Loci Count ,196725,,,,,,,,,,,,,,,,,
[Assay],,,,,,,,,,,,,,,,,,
IlmnID,Name,IlmnStrand,SNP,AddressA_ID,AlleleA_ProbeSeq,AddressB_ID,Allele
chr1:109457160-0_B_R_1602772855,chr1:109457160,BOT,[T/G],41769450,AGCCTTTT
chr1:109457233-0_B_F_1602777793,chr1:109457233,BOT,[T/G],11615356,CCTTCTCC
metaboChip = pe.getMetaboChip()
head(metaboChip)
skim_tee(metaboChip)
Skim summary statistics
 n obs: 196725 
 n variables: 19 

── Variable type:character ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         variable missing complete      n min max empty n_unique
 AlleleA_ProbeSeq       0   196725 196725  50  50     0   196701
 AlleleB_ProbeSeq  173712    23013 196725  50  50     0    23009
           IlmnID       0   196725 196725  23  36     0   196725
       IlmnStrand       0   196725 196725   3   3     0        2
             Name       0   196725 196725   5  17     0   196725
           Ploidy       0   196725 196725   7   7     0        1
              SNP       0   196725 196725   5   5     0       10
           Source       0   196725 196725   7  23     0        7
        SourceSeq       0   196725 196725  59 125     0   196711
     SourceStrand       0   196725 196725   3   3     0        2
          Species       0   196725 196725  12  12     0        1
    TopGenomicSeq       0   196725 196725  59 125     0   196705

── Variable type:factor ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 variable missing complete      n n_unique
      Chr       0   196725 196725       26
                              top_counts ordered
 6: 21966, 1: 18720, 11: 16108, 2: 15750   FALSE

── Variable type:integer ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      variable missing complete      n        mean       sd        p0
   AddressA_ID       0   196725 196725 4e+07        1.8e+07   1.1e+07
   AddressB_ID  173712    23013 196725     4.2e+07  1.9e+07   1.1e+07
     BeadSetID       0   196725 196725   224.28     0.96    223      
   GenomeBuild       0   196725 196725    35.88     1.44     18      
       MapInfo       0   196725 196725     7.3e+07  5.5e+07 923      
 SourceVersion       0   196725 196725    52.49    71.59      0      
       p25         p50       p75      p100     hist
   2.5e+07     3.9e+07   5.5e+07   7.5e+07 ▇▇▇▇▇▆▅▆
   2.6e+07     4.2e+07   5.8e+07   7.5e+07 ▇▇▇▇▇▇▇▇
 223         225       225       225       ▅▁▁▁▁▁▁▇
  36          36        36        92       ▁▇▁▁▁▁▁▁
   2.8e+07 6e+07         1.1e+08   2.5e+08 ▇▆▅▅▂▂▁▁
   0           0       128       978       ▇▅▁▁▁▁▁▁

Let’s attempt to build a model that can predict EO Preeclampsia from SNP data alone (spoiler alert: no success)

source("setup.r")
[1] "Notice: cache already initialized. Keeping existing cache."
[conflicted] Removing existing preference
[conflicted] Will prefer dplyr::filter over any other package
library(xgboost)
library(caTools)
conflict_prefer("union", "dplyr")
[conflicted] Will prefer dplyr::union over any other package
conflict_prefer("colnames", "base")
[conflicted] Will prefer base::colnames over any other package
conflict_prefer("as.numeric", "base")
[conflicted] Will prefer base::as.numeric over any other package
clinical = pe.getClinicalInfo()
domRecMatrix = pe.getNumericAlleleMatrix()
domRecMatrix[1:5,1:5]
     chr1_156843624 chr1_156843733 chr1_156850715 chr1_156853992
[1,]              1             -1              1              1
[2,]              1              1              0              1
[3,]              1              1              0              1
[4,]              1              1              0              1
[5,]              1              1              1              1
     chr1_156854468
[1,]              1
[2,]              1
[3,]              1
[4,]              1
[5,]              1
draw_confusion_matrix <- function(cm) {
  
  total = sum(cm$table)
  res <- as.numeric(cm$table)
  
  # Generate color gradients. Palettes come from RColorBrewer.
  greenPalette = c("#F7FCF5","#E5F5E0","#C7E9C0","#A1D99B","#74C476","#41AB5D","#238B45","#006D2C","#00441B")
  redPalette = c("#FFF5F0","#FEE0D2","#FCBBA1","#FC9272","#FB6A4A","#EF3B2C","#CB181D","#A50F15","#67000D")
  getColor = function (greenOrRed = "green", amount = 0) {
    if (amount == 0)
      return("#FFFFFF")
    palette = greenPalette
    if (greenOrRed == "red")
      palette = redPalette
    colorRampPalette(palette)(100)[10 + ceiling(90 * amount / total)]
  }

  # set the basic layout
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX', cex.main=2)
  
  # create the matrix 
  classes = colnames(cm$table)
  rect(150, 430, 240, 370, col=getColor("green", res[1]))
  text(195, 435, classes[1], cex=1.2)
  rect(250, 430, 340, 370, col=getColor("red", res[3]))
  text(295, 435, classes[2], cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col=getColor("red", res[2]))
  rect(250, 305, 340, 365, col=getColor("green", res[4]))
  text(140, 400, classes[1], cex=1.2, srt=90)
  text(140, 335, classes[2], cex=1.2, srt=90)
  
  # add in the cm results
  text(195, 400, res[1], cex=1.6, font=2, col='white')
  text(195, 335, res[2], cex=1.6, font=2, col='white')
  text(295, 400, res[3], cex=1.6, font=2, col='white')
  text(295, 335, res[4], cex=1.6, font=2, col='white')
  
  # add in the specifics 
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
  text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
  text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
  text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
  text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
  
  # add in the accuracy information 
  text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
  text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}
combined = cbind(clinical %>% select(preeclampsia), domRecMatrix)
combined$preeclampsia = as.factor(combined$preeclampsia)

split <- sample.split(clinical$preeclampsia, SplitRatio = 0.8)
training_set = subset(combined, split == TRUE)
validation_set = subset(combined, split == FALSE)

dTrain <- xgb.DMatrix(as.matrix(training_set[-1]), label = as.matrix(training_set[1]))
dValidation <- xgb.DMatrix(as.matrix(validation_set[-1]), label = as.matrix(validation_set[1]))
dwatchlist <- list(eval = dValidation, train = dTrain)

params <- list(eta = 0.1, max_depth = 2,  objective = "binary:logistic")
classifier <- xgb.train(data = dTrain, params = params, save_period = 100, watchlist = dwatchlist, 
  label = training_set[1], early_stopping_rounds = 5, nrounds = 50)
[1] eval-error:0.500000 train-error:0.020833 
Multiple eval metrics are present. Will use train_error for early stopping.
Will train until train_error hasn't improved in 5 rounds.

[2] eval-error:0.666667 train-error:0.020833 
[3] eval-error:0.416667 train-error:0.020833 
[4] eval-error:0.666667 train-error:0.000000 
[5] eval-error:0.666667 train-error:0.000000 
[6] eval-error:0.583333 train-error:0.000000 
[7] eval-error:0.666667 train-error:0.000000 
[8] eval-error:0.500000 train-error:0.000000 
[9] eval-error:0.583333 train-error:0.000000 
Stopping. Best iteration:
[4] eval-error:0.666667 train-error:0.000000
y_pred <- predict(classifier, newdata = as.matrix(validation_set[-1])) %>%  round

actual = as.data.frame(validation_set)[, 1]
importance = xgb.importance(colnames(combined)[-1], model = classifier)
ggplot(importance[1:10,], aes(x = Feature, y = Gain)) + geom_col()

cm = caret::confusionMatrix(factor(y_pred), factor(actual))
draw_confusion_matrix(cm)

print(importance[1:10,])
            Feature       Gain      Cover  Frequency
 1:       rs2652492 0.34887673 0.33553125 0.22222222
 2:       rs4736440 0.14551246 0.16446875 0.11111111
 3:       rs6034593 0.09757756 0.08659054 0.11111111
 4:  chr1:119340228 0.07513690 0.06663553 0.07407407
 5:      rs12193094 0.05426129 0.02975212 0.03703704
 6: chr13:109780171 0.04842820 0.02975212 0.03703704
 7:  chr1:119258927 0.03764861 0.02956836 0.03703704
 8:       rs2039465 0.03372908 0.03306453 0.03703704
 9:   chr8:19788876 0.03069535 0.02866063 0.03703704
10:      rs13078384 0.02808678 0.02793489 0.03703704
cm
Confusion Matrix and Statistics

          Reference
Prediction 0 1
         0 1 3
         1 5 3
                                          
               Accuracy : 0.3333          
                 95% CI : (0.0992, 0.6511)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : 0.9270          
                                          
                  Kappa : -0.3333         
 Mcnemar's Test P-Value : 0.7237          
                                          
            Sensitivity : 0.16667         
            Specificity : 0.50000         
         Pos Pred Value : 0.25000         
         Neg Pred Value : 0.37500         
             Prevalence : 0.50000         
         Detection Rate : 0.08333         
   Detection Prevalence : 0.33333         
      Balanced Accuracy : 0.33333         
                                          
       'Positive' Class : 0               
                                          

Try cross validation

clinical = pe.getClinicalInfo()
domRecMatrix = pe.getNumericAlleleMatrix()
params <- list(eta = 0.02, max_depth = 2,  objective = "binary:logistic")
booster = xgb.cv(params, domRecMatrix, 30, nfold = 5, label = clinical$preeclampsia)
[1] train-error:0.062507+0.022822   test-error:0.568532+0.105236 
[2] train-error:0.041582+0.034912   test-error:0.553147+0.102857 
[3] train-error:0.045837+0.030617   test-error:0.523660+0.136221 
[4] train-error:0.041582+0.034912   test-error:0.522378+0.123573 
[5] train-error:0.037415+0.035883   test-error:0.539044+0.113392 
[6] train-error:0.033248+0.038668   test-error:0.522378+0.123573 
[7] train-error:0.033248+0.038668   test-error:0.555711+0.112550 
[8] train-error:0.033248+0.038668   test-error:0.522378+0.123573 
[9] train-error:0.033248+0.038668   test-error:0.555711+0.112550 
[10]    train-error:0.020748+0.018635   test-error:0.539044+0.113392 
[11]    train-error:0.016582+0.015569   test-error:0.539044+0.113392 
[12]    train-error:0.016582+0.015569   test-error:0.539044+0.113392 
[13]    train-error:0.012415+0.016625   test-error:0.539044+0.113392 
[14]    train-error:0.012415+0.016625   test-error:0.539044+0.113392 
[15]    train-error:0.012415+0.016625   test-error:0.539044+0.113392 
[16]    train-error:0.008248+0.010103   test-error:0.555711+0.112550 
[17]    train-error:0.008248+0.010103   test-error:0.572378+0.121241 
[18]    train-error:0.008248+0.010103   test-error:0.572378+0.121241 
[19]    train-error:0.008248+0.010103   test-error:0.572378+0.121241 
[20]    train-error:0.004082+0.008163   test-error:0.589044+0.115749 
[21]    train-error:0.004082+0.008163   test-error:0.572378+0.121241 
[22]    train-error:0.004082+0.008163   test-error:0.555711+0.112550 
[23]    train-error:0.004082+0.008163   test-error:0.572377+0.109186 
[24]    train-error:0.004082+0.008163   test-error:0.572378+0.121241 
[25]    train-error:0.004082+0.008163   test-error:0.589044+0.115749 
[26]    train-error:0.004082+0.008163   test-error:0.572378+0.121241 
[27]    train-error:0.004082+0.008163   test-error:0.589044+0.115749 
[28]    train-error:0.004082+0.008163   test-error:0.572378+0.121241 
[29]    train-error:0.004082+0.008163   test-error:0.589044+0.115749 
[30]    train-error:0.004082+0.008163   test-error:0.589044+0.115749 
booster$evaluation_log[booster$best_iteration]$test_error_mean
NULL

Let’s try to predict EOPE on clinical data as a sanity check. We expect 0 error/100% accuracy.

clinicalNumeric = as.matrix(pe.getClinicalAsNumerics())
training_set = subset(clinicalNumeric, split == TRUE)
validation_set = subset(clinicalNumeric, split == FALSE)
dTrain <- xgb.DMatrix(training_set[,-1], label = training_set[,1])

params <- list(eta = 0.3, max_depth = 1, objective = "binary:logistic")
classifier <- xgb.train(params, dTrain, 20)

y_pred <- predict(classifier, newdata = validation_set[,-1]) %>% round

actual = as.data.frame(validation_set)[, 1]
importance = xgb.importance(colnames(clinicalNumeric)[-1], model = classifier)
ggplot(importance[1:10,], aes(x = Feature, y = Gain)) + geom_col()
Warning: Removed 9 rows containing missing values (position_stack).

cm = caret::confusionMatrix(factor(y_pred), factor(actual))
draw_confusion_matrix(cm)

print(importance[1:10,])
    Feature Gain Cover Frequency
 1: gestAge    1     1         1
 2:    <NA>   NA    NA        NA
 3:    <NA>   NA    NA        NA
 4:    <NA>   NA    NA        NA
 5:    <NA>   NA    NA        NA
 6:    <NA>   NA    NA        NA
 7:    <NA>   NA    NA        NA
 8:    <NA>   NA    NA        NA
 9:    <NA>   NA    NA        NA
10:    <NA>   NA    NA        NA
cm
Confusion Matrix and Statistics

          Reference
Prediction 0 1
         0 6 0
         1 0 6
                                     
               Accuracy : 1          
                 95% CI : (0.7354, 1)
    No Information Rate : 0.5        
    P-Value [Acc > NIR] : 0.0002441  
                                     
                  Kappa : 1          
 Mcnemar's Test P-Value : NA         
                                     
            Sensitivity : 1.0        
            Specificity : 1.0        
         Pos Pred Value : 1.0        
         Neg Pred Value : 1.0        
             Prevalence : 0.5        
         Detection Rate : 0.5        
   Detection Prevalence : 0.5        
      Balanced Accuracy : 1.0        
                                     
       'Positive' Class : 0          
                                     
# rmarkdown::render('main.rmd')
LS0tCnRpdGxlOiAiTWF0ZXJuYWwgQ2FyZGlvdmFzY3VsYXItUmVsYXRlZCBTaW5nbGUgTnVjbGVvdGlkZSBQb2x5bW9ycGhpc21zLCBHZW5lcywgYW5kIEdlbmUgUmVndWxhdG9yeSBOZXR3b3JrcyBBc3NvY2lhdGVkIHdpdGggRWFybHktT25zZXQgUHJlZWNsYW1wc2lhIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIE1hdGVybmFsIENhcmRpb3Zhc2N1bGFyLVJlbGF0ZWQgU2luZ2xlIE51Y2xlb3RpZGUgUG9seW1vcnBoaXNtcywgR2VuZXMsIGFuZCBHZW5lIFJlZ3VsYXRvcnkgTmV0d29ya3MgQXNzb2NpYXRlZCB3aXRoIEVhcmx5LU9uc2V0IFByZWVjbGFtcHNpYQoKIyMgUHJlZWNsYW1wc2lhIHJldmlldwoKLSBFYXJseSBzaWducyBvZiBwcmVlY2xhbXBzaWEgaW5jbHVkZSBoaWdoIGJsb29kIHByZXNzdXJlIGFuZCBwcm90ZWluIGluIHVyaW5lCi0gQ2F1c2VkIGJ5IHRoZSBwbGFjZW50YQotIDIgLSA4JSB3b3JsZHdpZGUKLSA2M2sgZGVhdGhzIHdvcmxkd2lkZQotIEVhcmxpZXIgb2NjdXJyZW5jZSBhc3NvY2lhdGVkIHdpdGggd29yc2Ugb3V0Y29tZXMKLSA1MCB3b21lbiBwZXIgeWVhciBkaWUgaW4gVVMgb3V0IG9mIDRNIGJpcnRocwoKIyMgRWFybHktT25zZXQgUHJlZWNsYW1wc2lhIChFT1BFKQotIEEgc3Vic2V0IG9mIFBFCi0gRGlzdGluZ3Vpc2hpbmcgY2hhcmFjdGVyaXN0aWNzOgotLSBEZXZlbG9wcyBiZWZvcmUgMzQgd2Vla3Mgb2YgZ2VzdGF0aW9uLiAoTGF0ZSBPbnNldCBpcyBhZnRlciAzNyB3ZWVrcykKLS0gRU9QRSBpcyBvZnRlbiBjb21wbGljYXRlZCBieSBJVUdSIChJbnRlcnV0ZXJpbmUgR3Jvd3RoIFJlc3RyaWN0aW9uKQotLSBFT1BFIGhhcyBjbGVhciBwbGFjZW50YWwgcGF0aG9sb2d5Ci0tIExlaXNpb24gb2YgdXRlcmFscGxhY2VudGFsIG1hbHBlcmZ1c2lvbgoKIyMgSWRlYQoKLSBFT1BFIG1heSBoYXZlIGFuIHVuZGVybHlpbmcgZ2VuZXRpYyBwcmVkaXNwb3NpdGlvbiwgKHN0dWRpZXMgc2hvdyBmYW1pbHkgaGlzdG9yeSBhIGZhY3RvcikuCi0gTWFueSBwcmV2aW91cyBnZW5ldGljIGludmVzdGlnYXRpb25zIGhhdmUgbG9va2VkIGF0IGdlbmVzIGludm9sdmVkIGluIH4xMCBwcm9jZXNzZXMgc3VjaCBhcyBhbmdpb2dlbmVzaXMgYW5kIGxpcGlkIG1ldGFib2xpc20KLSBUaGVyZSBhcmUgc29tZSBrbm93biBhc3NvY2lhdGlvbnMgYmV0d2VlbiBwcmVlY2xhbXBzaWEgYW5kIGNhcmRpb3Zhc2N1bGFyIGFuZCBtZXRhYm9saWMgZGlzZWFzZSBzdWNoIGFzIGNocm9uaWMgaHlwZXJ0ZW5zaW9uIGFuZCBkaWFiZXRlcywgYW5kIHdvbWVuIHdpdGggYSBoaXN0b3J5IG9mIHByZWVjbGFtcHNpYSBtb3JlIGxpa2VseSB0byBnZXQgdGhlc2UgZGlzZWFzZXMKLSBUaGVyZSBhcmUgcmFjaWFsIGRpZmZlcmVuY2VzIGluIHJhdGVzIG9mIHByZWVjbGFtcHNpYSwgTkhPUEkgYW5kIEZpbGlwaW5vcyBoYXZlIGluY3JlYXNlZCByaXNrCgojIyBPYnNlcnZhdGlvbnMKCi0gNjAgbW90aGVycywgZmlyc3QgZ2VzdGF0aW9uLCB3aG8gZGVsaXZlcmVkIGZ1bGwtdGVybSBub3JtYWwgYmlydGh3ZWlnaHQgaW5mYW50cwotIENhc2VzOiAzMSB3b21lbiB3aXRoIEVPUEUKIC0gRXRobmljYWxseSBkaXZlcnNlCiAgLSAxNSBGaWxpcGlubwogIC0gNiBXaGl0ZQogIC0gNCBvdGhlciBBc2lhbgogIC0gMyBOYXRpdmUgSGF3YWlpYW4gYW5kIG90aGVyIFBhY2lmaWMgSXNsYW5kZXJzCi0gQ29udHJvbHM6IDI5IGNvbnRyb2xzCgojIyBEYXRhIGNvbGxlY3RlZAoKLSBNZXRhYm9DaGlwIHdpdGggMTk2LDcyNTAgU05QcyBhc3NvY2lhdGVkIHdpdGggbWV0YWJvbGljIGFuZCBjYXJkaW92YXNjdWxhciB0cmFpdHMKIC0gQWxsIHJhdyBkYXRhIGlzIGluIGFuIElsbHVtaW5hIEZpbmFsUmVwb3J0IHR4dCBmaWxlCiAgIC0gNDAwTUIgdW5jb21wcmVzc2VkCiAgIC0gVFNWIGZvcm1hdCAoc2tpcCB0aGUgZmlyc3QgOSBsaW5lcykKICAgLSAxMS44TSBsaW5lcy4gMSBsaW5lIGZvciBlYWNoIFNOUC4KLSBDbGluaWNhbCBkYXRhIGluIGNsaW5pY2FsLmNzdgoKIyMgR29hbAoKLSBGaW5kIGFzc29jaWF0aW9ucyB3aXRoIEVPUEUKICAtIEluIFNOUFMgKGNhcmRpb3Zhc2N1bGFyIGFuZCBtZXRhYm9saWMgaW4gcGFydGljdWxhcikKICAtIEdlbmVzCiAgLSBSZWd1bGF0b3J5IHBhdGh3YXlzCgojIyBSZXN1bHRzCgotIE5vIFNOUHMgZm91bmQgdG8gYmUgY29ycmVsYXRlZCAoYWZ0ZXIgY29ycmVjdGlvbnMpCi0gVGhyb3VnaCBnZW5lLWJhc2VkIHRlc3RzOgoKCiMjIFNlY3Rpb25zCgoKYGBge3IgY2hpbGQgPSAnbWFuaGF0dGFuLXBsb3RzLnJtZCd9CmBgYAoKYGBge3IgY2hpbGQgPSAncGNhcy5ybWQnfQpgYGAKCmBgYHtyIGNoaWxkID0gJ3FxcGxvdC5ybWQnfQpgYGAKCmBgYHtyIGNoaWxkID0gJ2d3YXMtdGVzdHMucm1kJ30KYGBgCgpgYGB7ciBjaGlsZCA9ICdjbGluaWNhbC1lZGEucm1kJ30KYGBgCgpgYGB7ciBjaGlsZCA9ICdwaHlsb3RyZWUucm1kJ30KYGBgCgpgYGB7ciBjaGlsZCA9ICd0c25lLnJtZCd9CmBgYAoKYGBge3IgY2hpbGQgPSAnc25wLWVkYS5ybWQnfQpgYGAKCmBgYHtyIGNoaWxkID0gJ21ldGFib2NoaXAtZWRhLnJtZCd9CmBgYAoKYGBge3IgY2hpbGQgPSAneGdib29zdC5ybWQnfQpgYGAKCmBgYHtyfQojIHJtYXJrZG93bjo6cmVuZGVyKCdtYWluLnJtZCcpCmBgYAoK