### Set my working directory to the folder
### where I keep the sample data files for class
setwd("~/Desktop/Teaching/CompGenWS-GEN8900/")

### Get the most up-to-date version of
### the R functions script
library(devtools)
source_gist("c954c757db02eb6c4ccfae1aa090658c", filename="compGen17_fxns.R", quiet=TRUE)

### Use the read.vcf function to read in
### the sample data file for week 3:
my.data = read.vcf(file="SampleData/sampleData_wk3.vcf", header=TRUE, stringsAsFactors=FALSE)

### EXERCISE 1: CALCULATE EXPECTED GENOTYPE COUNTS

### Since the first part of this exercise
### is the same as one of our exercises from
### last week, I'm mostly going to re-use code from
### the week 2 solutions.
### But, instead of just copying and pasting,
### I'm going to use that code to make some new functions.

### I've already got a function to get the genotypes;
### now I'm adding a function to count them
count.genotypes <- function(genotypes) {
    # First, get rid of the / or | symbol, so the genotypes look the same no matter what
    genotypes = gsub("(\\||/)", "", genotypes)
    # Here are all the possible patterns to count
    gen.patterns = c("00", "01", "10", "11", "..")
    # Then, use the same table() code I had in my solution last week
    my.counts = table(factor(genotypes, levels=gen.patterns))
    # Sum together the 2 heterozygote options
    final.counts = c(my.counts[1], (my.counts[2]+my.counts[3]), my.counts[4:5])
    # Finally, give some names to the vector to make it easy to find a specific count
    names(final.counts) = c("AA", "Aa", "aa", "NN")
    return(final.counts)
}

### The next function I'm going to create
### will calculate allele frequencies.
### This function will take the genotype
### counts as input, and return p and q
allele.freq <- function(genotypeCounts) {
    n = sum(genotypeCounts) - genotypeCounts["NN"]
    p = ((2*genotypeCounts["AA"]) + genotypeCounts["Aa"])/(2*n)
    q = 1 - p
    freqs = c(p, q)
    names(freqs) = c("p", "q")
    return(freqs)
}

### With my new functions all set up,
### I'm now going to do all of Exercise 1
### in one loop

### Set up an empty results table
blank.col = rep(0, nrow(my.data))
my.results = data.frame(obsAA=blank.col, obsAa=blank.col, obsaa=blank.col,
                        p=blank.col, q=blank.col, N=blank.col,
                        expAA=blank.col, expAa=blank.col, expaa=blank.col)

### Run the loop to do all calculations
### on each row
for (i in 1:nrow(my.data)) {
    my.row = as.vector(my.data[i,], mode="character")
    my.samples = my.row[10:length(my.row)]
    my.gens = get.field(samples=my.samples, format=my.row[9], fieldName="GT")
    # Use the new function to get the observed genotype counts
    obs.counts = count.genotypes(my.gens)
    # Fill in the observed counts (the first 3 columns of my.results) for the ith row
    my.results[i, 1:3] = c(obs.counts["AA"], obs.counts["Aa"], obs.counts["aa"])
    # Use the other new function to get the allele frequencies
    obs.freq = allele.freq(obs.counts)
    # Fill in the next 2 columns (4 and 5) with the p and q values for the ith row
    my.results[i, 4:5] = obs.freq
    # Get the total of the non-missing genotypes
    my.results$N[i] = sum(obs.counts) - obs.counts["NN"]
    # Calculate expAA as p-squared times N (for the ith row)
    my.results$expAA[i] = (my.results$p[i]**2) * my.results$N[i]
    # expAa = 2pq times N
    my.results$expAa[i] = 2 * my.results$p[i] * my.results$q[i] * my.results$N[i]
    # expaa = q-squared times N
    my.results$expaa[i] = (my.results$q[i]**2) * my.results$N[i]
}
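
### Before moving on to Exercise 2, a quick optional sanity check that is not
### part of the assignment: I like to run the two new functions on a tiny
### made-up genotype vector, where I can count everything by hand, to make
### sure the output looks the way I expect
test.gens = c("0/0", "0/1", "1|0", "1/1", "./.")
count.genotypes(test.gens)                  # should give AA=1, Aa=2, aa=1, NN=1
allele.freq(count.genotypes(test.gens))     # should give p=0.5, q=0.5
### It's also worth glancing at the first few rows of the filled-in table:
head(my.results)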

### EXERCISE 2: Use Fisher's Exact Test to see which sites differ significantly from HWE

### First, I create an empty Pvalue column and add it to my results table:
Pvalue = blank.col
my.results = cbind(my.results, Pvalue)

### Next, I run a for() loop on my results table and use the
### code given along with the exercise to run the Fisher's test
for (i in 1:nrow(my.results)) {
    a = as.integer(my.results$obsAA[i])
    b = as.integer(my.results$obsAa[i]/2)
    c = as.integer(my.results$obsAa[i] - b)
    d = as.integer(my.results$obsaa[i])
    x = matrix(c(a,b,c,d), nrow=2, ncol=2, byrow=TRUE)
    test = fisher.test(x)
    my.results$Pvalue[i] = test$p.value
}

### EXERCISE 3: Check Loci with Significant Deviation from HWE

### Part A: How many loci deviate from HWE?
sig.loci = which(my.results$Pvalue < 0.05)   # returns the row numbers with a p-value less than 0.05
num.sig.loci = length(sig.loci)              # to get the count, just take the length (you should get 44 loci)

### Part B: What might cause these loci to deviate?
### As shown in the exercises, I can figure out how
### loci deviate by calculating the difference between
### observed and expected heterozygosity
my.results$Diff = my.results$obsAa - my.results$expAa

### To check whether sites where obsAa is too low
### have low read depth, I use the get.field
### function from the week 2 solutions to get the read
### depth for every sample
my.results$rdDepth = rep(0, nrow(my.results))
for (i in 1:nrow(my.data)) {
    d = get.field(as.vector(my.data[i, 10:ncol(my.data)], mode="character"), my.data[i,9], "DP")
    # Replace the missing-data character so the vector can be treated as numeric
    d[d=="."] = NA
    my.results$rdDepth[i] = mean(as.numeric(d), na.rm=TRUE)
}

### Now, I'll use subset() to look at loci
### where: 1) p-value < 0.05, and 2) Diff < 0 (obs Aa too low)
sig.results = my.results[sig.loci,]   # uses the row numbers from which() above to return a table with only the significant rows
too.low = subset(sig.results, Diff < 0)

### When you print the "too.low" table, you should see 5 rows, and the avg.
### read depth for all of them is around 1.5, so definitely too low!

### Next, let's look at sites where obs(Aa) is too high.
### Notice that I can subset the original my.data table using
### a condition that I test in a different table (the results);
### this works because their rows line up (i.e. row 1 of my.results
### corresponds to a calculation done on row 1 of my.data, and so on...)
too.high = my.data[which((my.results$Pvalue < 0.05) & (my.results$Diff > 0)),]   # which() with 2 conditions returns only the rows where both are met

### To look at the results more easily, I'll only print the first
### 2 columns, since that is where the position info. is
too.high[,1:2]

### Notice that the rows at the end (row # 990-999) are
### 10 positions all right next to each other - this definitely
### jumps out as a possible paralog. The rest of the sites are less
### obvious. A few regions (15-18, 415-418) are a bit suspect, but
### the others don't jump out as mapping errors.
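
### (Optional) A quick way to see the read-depth evidence side by side for the
### heterozygote-deficient sites. This just reuses columns already built above,
### so the column names are the same, but the exact values will depend on the
### data file you ran
too.low[, c("obsAa", "expAa", "Diff", "Pvalue", "rdDepth")]
mean(too.low$rdDepth)        # avg. read depth at the "too low" sites (around 1.5)
mean(my.results$rdDepth)     # avg. read depth across all sites, for comparison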