Oly Genetics 101

Oly-Genetics-NF-Testing

Oly Genetics Analysis, playing around with data

Today I played with the 2016/2017 Oly microsatellite data in R, generating some summary statistics by following a few online tutorials. What I’ve learned: * The spreadhseet “Olympic Oyster NFH_NFW (1).xlsx” from Crystal houses data from NF population (Fidalgo Bay) from two populations: 2016 hatchery-produced (F1), and 2017 wild. Crystal already determined bins, rounded the microsatellite data to produce: 100 samples with diploid genotype data across 6 loci (2 alleles per loci).
* microsatellite data is often in a particular spreadsheet format, whith metadata in the first 2 columns, and summary data in the first 2 rows. I exported the rounded data from Crystal’s spreadsheet, modified to conform with the required format, and saved as 3 separate files: 1. NFH-2016 alone 2. NFW-2017 alone 3. NFH-2016 & NFW-2017 merged * NOTE: There are additional microsatellite data from 2015, available online from Rick/Andy: https://www.dropbox.com/sh/0tz6a4f8tz8rwap/AACXhjgrZc2UWYSAk4N-uoM6a?dl=0

My github repo: https://github.com/laurahspencer/O.lurida_genetics

# Import 2016/2017 microsatellite data
# Used the following reference: https://grunwaldlab.github.io/Population_Genetics_in_R/TOC.html

NFH.2016 <- read.genalex("Data/Oly2016NFH_Rounded.csv", ploidy=2) #read 
NFW.2017 <- read.genalex("Data/Oly2017NFW_Rounded.csv", ploidy=2)
NF <- read.genalex("Data/Oly2016NFH+2017NFW_Merged.csv", ploidy=2)
summary(NFH.2016)
## 
## // Number of individuals: 100
## // Group sizes: 100
## // Number of alleles per locus: 31 14 14 17 18 18
## // Number of alleles per group: 112
## // Percentage of missing data: 3 %
## // Observed heterozygosity: 0.93 0.71 0.95 0.94 0.94 0.93
## // Expected heterozygosity: 0.94 0.75 0.86 0.92 0.92 0.92
summary(NFW.2017) #summary of wild samples
## 
## // Number of individuals: 99
## // Group sizes: 99
## // Number of alleles per locus: 30 13 15 19 20 19
## // Number of alleles per group: 116
## // Percentage of missing data: 1.85 %
## // Observed heterozygosity: 0.98 0.71 0.88 0.93 0.94 0.93
## // Expected heterozygosity: 0.94 0.74 0.87 0.93 0.93 0.93
summary(NF) #summary of hatchery and wild combined 
## 
## // Number of individuals: 199
## // Group sizes: 99 100
## // Number of alleles per locus: 34 15 17 19 20 19
## // Number of alleles per group: 116 112
## // Percentage of missing data: 2.43 %
## // Observed heterozygosity: 0.95 0.71 0.91 0.93 0.94 0.93
## // Expected heterozygosity: 0.95 0.75 0.87 0.93 0.93 0.93
info_table(NF, type="missing", plot=TRUE) #see how the missing data is distributed over the 2 populations

##           Locus
## Population Olur10 Olur11 Olur12 Olur13 Olur15 Olur19  Mean
##   NFW-2017  0.020  0.040  0.010  0.010  0.010  0.020 0.019
##   NFH-2016  0.020  0.060      .  0.040  0.020  0.040 0.030
##   Total     0.020  0.050  0.005  0.025  0.015  0.030 0.024
mlg.table(NF) #genotype eveness. Result is N=199; MLG=199
## Warning: package 'bindrcpp' was built under R version 3.3.2

##          MLG.1 MLG.2 MLG.3 MLG.4 MLG.5 MLG.6 MLG.7 MLG.8 MLG.9 MLG.10
## NFW-2017     0     0     1     1     1     0     0     1     0      0
## NFH-2016     1     1     0     0     0     1     1     0     1      1
##          MLG.11 MLG.12 MLG.13 MLG.14 MLG.15 MLG.16 MLG.17 MLG.18 MLG.19
## NFW-2017      0      0      0      1      1      0      1      1      0
## NFH-2016      1      1      1      0      0      1      0      0      1
##          MLG.20 MLG.21 MLG.22 MLG.23 MLG.24 MLG.25 MLG.26 MLG.27 MLG.28
## NFW-2017      1      0      1      0      0      1      0      1      1
## NFH-2016      0      1      0      1      1      0      1      0      0
##          MLG.29 MLG.30 MLG.31 MLG.32 MLG.33 MLG.34 MLG.35 MLG.36 MLG.37
## NFW-2017      1      1      0      1      1      0      1      0      1
## NFH-2016      0      0      1      0      0      1      0      1      0
##          MLG.38 MLG.39 MLG.40 MLG.41 MLG.42 MLG.43 MLG.44 MLG.45 MLG.46
## NFW-2017      1      0      1      1      0      0      0      1      1
## NFH-2016      0      1      0      0      1      1      1      0      0
##          MLG.47 MLG.48 MLG.49 MLG.50 MLG.51 MLG.52 MLG.53 MLG.54 MLG.55
## NFW-2017      0      1      0      1      0      1      1      0      0
## NFH-2016      1      0      1      0      1      0      0      1      1
##          MLG.56 MLG.57 MLG.58 MLG.59 MLG.60 MLG.61 MLG.62 MLG.63 MLG.64
## NFW-2017      0      0      1      0      0      1      0      0      0
## NFH-2016      1      1      0      1      1      0      1      1      1
##          MLG.65 MLG.66 MLG.67 MLG.68 MLG.69 MLG.70 MLG.71 MLG.72 MLG.73
## NFW-2017      0      1      1      0      0      0      0      1      1
## NFH-2016      1      0      0      1      1      1      1      0      0
##          MLG.74 MLG.75 MLG.76 MLG.77 MLG.78 MLG.79 MLG.80 MLG.81 MLG.82
## NFW-2017      1      1      1      1      0      0      1      0      1
## NFH-2016      0      0      0      0      1      1      0      1      0
##          MLG.83 MLG.84 MLG.85 MLG.86 MLG.87 MLG.88 MLG.89 MLG.90 MLG.91
## NFW-2017      1      0      1      1      0      0      1      0      0
## NFH-2016      0      1      0      0      1      1      0      1      1
##          MLG.92 MLG.93 MLG.94 MLG.95 MLG.96 MLG.97 MLG.98 MLG.99 MLG.100
## NFW-2017      1      0      0      1      0      0      1      1       1
## NFH-2016      0      1      1      0      1      1      0      0       0
##          MLG.101 MLG.102 MLG.103 MLG.104 MLG.105 MLG.106 MLG.107 MLG.108
## NFW-2017       0       1       1       1       1       0       0       1
## NFH-2016       1       0       0       0       0       1       1       0
##          MLG.109 MLG.110 MLG.111 MLG.112 MLG.113 MLG.114 MLG.115 MLG.116
## NFW-2017       1       1       1       0       0       0       1       0
## NFH-2016       0       0       0       1       1       1       0       1
##          MLG.117 MLG.118 MLG.119 MLG.120 MLG.121 MLG.122 MLG.123 MLG.124
## NFW-2017       0       1       0       1       0       1       1       1
## NFH-2016       1       0       1       0       1       0       0       0
##          MLG.125 MLG.126 MLG.127 MLG.128 MLG.129 MLG.130 MLG.131 MLG.132
## NFW-2017       1       0       1       0       0       0       0       0
## NFH-2016       0       1       0       1       1       1       1       1
##          MLG.133 MLG.134 MLG.135 MLG.136 MLG.137 MLG.138 MLG.139 MLG.140
## NFW-2017       1       0       0       0       0       1       0       0
## NFH-2016       0       1       1       1       1       0       1       1
##          MLG.141 MLG.142 MLG.143 MLG.144 MLG.145 MLG.146 MLG.147 MLG.148
## NFW-2017       1       1       0       0       0       0       1       0
## NFH-2016       0       0       1       1       1       1       0       1
##          MLG.149 MLG.150 MLG.151 MLG.152 MLG.153 MLG.154 MLG.155 MLG.156
## NFW-2017       1       0       1       1       1       0       1       1
## NFH-2016       0       1       0       0       0       1       0       0
##          MLG.157 MLG.158 MLG.159 MLG.160 MLG.161 MLG.162 MLG.163 MLG.164
## NFW-2017       0       1       1       0       0       1       0       1
## NFH-2016       1       0       0       1       1       0       1       0
##          MLG.165 MLG.166 MLG.167 MLG.168 MLG.169 MLG.170 MLG.171 MLG.172
## NFW-2017       1       1       0       1       1       1       1       0
## NFH-2016       0       0       1       0       0       0       0       1
##          MLG.173 MLG.174 MLG.175 MLG.176 MLG.177 MLG.178 MLG.179 MLG.180
## NFW-2017       0       0       0       0       1       1       0       0
## NFH-2016       1       1       1       1       0       0       1       1
##          MLG.181 MLG.182 MLG.183 MLG.184 MLG.185 MLG.186 MLG.187 MLG.188
## NFW-2017       1       1       1       1       1       0       0       1
## NFH-2016       0       0       0       0       0       1       1       0
##          MLG.189 MLG.190 MLG.191 MLG.192 MLG.193 MLG.194 MLG.195 MLG.196
## NFW-2017       1       0       1       0       0       0       1       0
## NFH-2016       0       1       0       1       1       1       0       1
##          MLG.197 MLG.198 MLG.199
## NFW-2017       0       1       1
## NFH-2016       1       0       0

Generate more summary statistics using poppr function

Abbreviations used in t Statistic
* Pop: Population name.
* N: Number of individuals observed.
* MLG: Number of multilocus genotypes (MLG) observed.
* eMLG: The number of expected MLG at the smallest sample size ≥ 10 based on rarefaction
* SE: Standard error based on eMLG.
* H: Shannon-Wiener Index of MLG diversity (Shannon, 2001).
* G: Stoddart and Taylor’s Index of MLG diversity (Stoddart & Taylor, 1988).
* lambda: Simpson’s Index (Simpson, 1949). 0 = no genotypes are differet; 1 = all genotypes are different
* E.5: Evenness, E5E5 (Pielou, 1975; Ludwig & Reynolds, 1988; Grünwald et al., 2003).
* Hexp: Nei’s unbiased gene diversity (Nei, 1978).
* Ia: The index of association, IAIA (Brown, Feldman & Nevo, 1980; Smith et al., 1993).
* rbarD: The standardized index of association, r¯dr¯d [@].

NF.pop <- poppr(NF) #summary stats on each population
NF.pop
##        Pop   N MLG eMLG       SE    H   G lambda E.5  Hexp    Ia rbarD
## 1 NFW-2017  99  99   99 0.00e+00 4.60  99  0.990   1 0.894 0.611 0.124
## 2 NFH-2016 100 100   99 2.58e-08 4.61 100  0.990   1 0.891 0.590 0.120
## 3    Total 199 199   99 0.00e+00 5.29 199  0.995   1 0.893 0.605 0.122
##   File
## 1   NF
## 2   NF
## 3   NF
(NF.pop$N / (NF.pop$N - 1)) * NF.pop$lambda #corrected simpson's index (N/(N-1)) #all different genotypes
## [1] 1 1 1

Are our populations in Hardy-Weinberg equilibrium?

Hardy‐Weinberg Assumptions include: • infinite population • discrete generations • random mating • no selection • no migration in or out of population • no mutation • equal initial genotype frequencies in the two sexes Equilibrium is reached after one generation of mating under the Hardy‐Weinberg assumptions…Genotype frequencies remain the same from generation to generation.

library("pegas")
## Warning: package 'pegas' was built under R version 3.3.2
## Loading required package: ape
## Warning: package 'ape' was built under R version 3.3.2
## 
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
## 
##     mst
## The following object is masked from 'package:ade4':
## 
##     amova
NF.HW <- seppop(NF) %>% lapply(hw.test, B=1000) #all P-values >0.05; reject the null that these populations are under HWE
NF.HW
## $`NFW-2017`
##            chi^2  df Pr(chi^2 >) Pr.exact
## Olur10 447.58545 435   0.3280911    0.285
## Olur11  48.88706  78   0.9959968    0.789
## Olur12  70.89750 105   0.9956481    0.918
## Olur13 166.41974 171   0.5846400    0.319
## Olur15 191.69997 190   0.4517938    0.315
## Olur19 165.39521 171   0.6065314    0.325
## 
## $`NFH-2016`
##            chi^2  df Pr(chi^2 >) Pr.exact
## Olur10 417.06735 465   0.9459973    0.615
## Olur11  58.00523  91   0.9972368    0.806
## Olur12  67.18583  91   0.9711158    0.822
## Olur13 137.46193 136   0.4487866    0.130
## Olur15 159.88890 153   0.3350415    0.085
## Olur19 159.56688 153   0.3415811    0.101

Hardy-Weinberg test results: reject the null that these populations are under HWE for all 6 loci. Here is a table with p-values for all loci, for each population:

NF.HW.P <- sapply(NF.HW, "[", i=TRUE, j=3) #pvalues of HW chi-squared test for all loci, both pops combined into a dataframe
NF.HW.P
##         NFW-2017  NFH-2016
## Olur10 0.3280911 0.9459973
## Olur11 0.9959968 0.9972368
## Olur12 0.9956481 0.9711158
## Olur13 0.5846400 0.4487866
## Olur15 0.4517938 0.3350415
## Olur19 0.6065314 0.3415811

Are populations in linkage disequilibrium?

Test the null hypothesis that alleles observed at different loci are not linked. This is the case if populations are sexual while alleles recombine freely into new genotypes during the process of sexual reproduction: IA =VO/VE -1 … where V0 is the observed variance of K and VE is the expected variance of K, where K is the number of loci at which two individuals differ.

library("magrittr")
NF.ia.H <- ia(popsub(NF, "NFH-2016"), sample=999)

NF.ia.H
##        Ia      p.Ia     rbarD      p.rD 
## 0.5900609 0.0010000 0.1195621 0.0010000
NF.ia.W <- ia(popsub(NF, "NFW-2017"), sample=999)

NF.ia.W
##        Ia      p.Ia     rbarD      p.rD 
## 0.6107414 0.0010000 0.1236043 0.0010000

Since P < 0.001, we find significant support for the hypothesis that alleles are linked across loci.

Let’s try to figure out which alleles are linked via pairwise assessment:

NF.W2017.pair <- pair.ia(popsub(NF, "NFW-2017"), quiet=F, plot=F)
NF.H2016.pair <- pair.ia(popsub(NF, "NFH-2016"), quiet=F, plot=F)
pair.range <- range(c(NF.W2017.pair, NF.H2016.pair), na.rm=TRUE)

Check out the pair.ia results for each population. From the below plots, it looks like loci 13, 15 & 19 are possibly linked

NF.W2017.pair
##                    Ia   rbarD
## Olur10:Olur11 -0.0104 -0.0107
## Olur10:Olur12  0.0109  0.0109
## Olur10:Olur13  0.0228  0.0228
## Olur10:Olur15  0.0143  0.0143
## Olur10:Olur19  0.0058  0.0058
## Olur11:Olur12 -0.0262 -0.0265
## Olur11:Olur13 -0.0372 -0.0386
## Olur11:Olur15 -0.0491 -0.0510
## Olur11:Olur19 -0.0453 -0.0461
## Olur12:Olur13 -0.0080 -0.0081
## Olur12:Olur15 -0.0040 -0.0041
## Olur12:Olur19  0.0072  0.0072
## Olur13:Olur15  0.7269  0.7269
## Olur13:Olur19  0.6486  0.6513
## Olur15:Olur19  0.8798  0.8833
plot(NF.W2017.pair, limits=pair.range, main="NFW-2017 Index of Association Pair Comparison")

NF.H2016.pair
##                    Ia   rbarD
## Olur10:Olur11  0.0114  0.0119
## Olur10:Olur12  0.0185  0.0185
## Olur10:Olur13  0.0081  0.0083
## Olur10:Olur15 -0.0075 -0.0075
## Olur10:Olur19 -0.0082 -0.0084
## Olur11:Olur12 -0.0496 -0.0511
## Olur11:Olur13 -0.0085 -0.0086
## Olur11:Olur15 -0.0196 -0.0201
## Olur11:Olur19  0.0133  0.0134
## Olur12:Olur13 -0.0095 -0.0096
## Olur12:Olur15 -0.0407 -0.0407
## Olur12:Olur19  0.0113  0.0114
## Olur13:Olur15  0.6340  0.6390
## Olur13:Olur19  0.5312  0.5312
## Olur15:Olur19  0.6484  0.6534
plot(NF.H2016.pair, limits=pair.range, main="NFH-2016 Index of Association Pair Comparison")

Review frequencies of each allele and plot frequency of hatchery vs. wild:

NF.freq <- rraf(NF, by_pop=TRUE)
NF.freq.t <- t(NF.freq)
NF.freq.t
##               NFW-2017    NFH-2016
## Olur10.214 0.149484536 0.127551020
## Olur10.238 0.067010309 0.107142857
## Olur10.304 0.025773196 0.030612245
## Olur10.210 0.015463918 0.010000000
## Olur10.276 0.061855670 0.025510204
## Olur10.282 0.036082474 0.045918367
## Olur10.286 0.046391753 0.020408163
## Olur10.202 0.015463918 0.010204082
## Olur10.240 0.051546392 0.071428571
## Olur10.250 0.030927835 0.035714286
## Olur10.288 0.020618557 0.035714286
## Olur10.246 0.010309278 0.030612245
## Olur10.256 0.015463918 0.005102041
## Olur10.292 0.046391753 0.040816327
## Olur10.220 0.010309278 0.010204082
## Olur10.280 0.020618557 0.030612245
## Olur10.232 0.056701031 0.040816327
## Olur10.244 0.025773196 0.025510204
## Olur10.226 0.030927835 0.010204082
## Olur10.274 0.067010309 0.066326531
## Olur10.270 0.025773196 0.025510204
## Olur10.234 0.030927835 0.030612245
## Olur10.298 0.025773196 0.030612245
## Olur10.300 0.030927835 0.010204082
## Olur10.306 0.010309278 0.010000000
## Olur10.228 0.020618557 0.020408163
## Olur10.294 0.020618557 0.045918367
## Olur10.268 0.005154639 0.015306122
## Olur10.252 0.020618557 0.010204082
## Olur10.318 0.005154639 0.010000000
## Olur10.316 0.010101010 0.005102041
## Olur10.310 0.010101010 0.015306122
## Olur10.216 0.010101010 0.010204082
## Olur10.222 0.010101010 0.010204082
## Olur11.140 0.178947368 0.196808511
## Olur11.162 0.389473684 0.388297872
## Olur11.144 0.273684211 0.234042553
## Olur11.139 0.010526316 0.010638298
## Olur11.152 0.042105263 0.026595745
## Olur11.158 0.036842105 0.010638298
## Olur11.136 0.010526316 0.010000000
## Olur11.171 0.010526316 0.021276596
## Olur11.135 0.005263158 0.005319149
## Olur11.156 0.010526316 0.026595745
## Olur11.150 0.005263158 0.005319149
## Olur11.176 0.005263158 0.005319149
## Olur11.164 0.021052632 0.047872340
## Olur11.174 0.010101010 0.010638298
## Olur11.168 0.010101010 0.010638298
## Olur12.250 0.061224490 0.035000000
## Olur12.266 0.193877551 0.200000000
## Olur12.178 0.168367347 0.210000000
## Olur12.268 0.091836735 0.120000000
## Olur12.254 0.168367347 0.155000000
## Olur12.262 0.056122449 0.065000000
## Olur12.256 0.051020408 0.060000000
## Olur12.260 0.096938776 0.075000000
## Olur12.252 0.010204082 0.010000000
## Olur12.182 0.040816327 0.020000000
## Olur12.272 0.040816327 0.030000000
## Olur12.284 0.005102041 0.005000000
## Olur12.170 0.005102041 0.010000000
## Olur12.240 0.005102041 0.010000000
## Olur12.188 0.005102041 0.010000000
## Olur12.274 0.010101010 0.005000000
## Olur12.248 0.010101010 0.010000000
## Olur13.253 0.071428571 0.015625000
## Olur13.257 0.122448980 0.078125000
## Olur13.261 0.051020408 0.078125000
## Olur13.277 0.086734694 0.135416667
## Olur13.237 0.045918367 0.046875000
## Olur13.289 0.076530612 0.057291667
## Olur13.273 0.066326531 0.062500000
## Olur13.293 0.051020408 0.114583333
## Olur13.241 0.035714286 0.031250000
## Olur13.249 0.071428571 0.078125000
## Olur13.285 0.102040816 0.072916667
## Olur13.245 0.020408163 0.046875000
## Olur13.281 0.081632653 0.093750000
## Olur13.265 0.045918367 0.041666667
## Olur13.269 0.035714286 0.010416667
## Olur13.301 0.020408163 0.020833333
## Olur13.309 0.005102041 0.010000000
## Olur13.305 0.005102041 0.010000000
## Olur13.297 0.005102041 0.015625000
## Olur15.175 0.076530612 0.015306122
## Olur15.177 0.122448980 0.076530612
## Olur15.181 0.045918367 0.076530612
## Olur15.197 0.086734694 0.132653061
## Olur15.157 0.045918367 0.035714286
## Olur15.209 0.076530612 0.056122449
## Olur15.193 0.066326531 0.061224490
## Olur15.213 0.045918367 0.112244898
## Olur15.161 0.035714286 0.030612245
## Olur15.169 0.071428571 0.076530612
## Olur15.205 0.102040816 0.081632653
## Olur15.165 0.020408163 0.045918367
## Olur15.201 0.081632653 0.096938776
## Olur15.185 0.045918367 0.045918367
## Olur15.189 0.035714286 0.010204082
## Olur15.221 0.020408163 0.025510204
## Olur15.229 0.005102041 0.005102041
## Olur15.225 0.005102041 0.010000000
## Olur15.217 0.005102041 0.015306122
## Olur15.151 0.005102041 0.010000000
## Olur19.225 0.072164948 0.015625000
## Olur19.229 0.123711340 0.072916667
## Olur19.233 0.051546392 0.078125000
## Olur19.249 0.087628866 0.135416667
## Olur19.209 0.041237113 0.046875000
## Olur19.261 0.072164948 0.057291667
## Olur19.245 0.067010309 0.062500000
## Olur19.265 0.051546392 0.109375000
## Olur19.213 0.036082474 0.026041667
## Olur19.221 0.072164948 0.072916667
## Olur19.257 0.103092784 0.083333333
## Olur19.217 0.020618557 0.046875000
## Olur19.253 0.082474227 0.098958333
## Olur19.237 0.046391753 0.041666667
## Olur19.241 0.036082474 0.010416667
## Olur19.273 0.020618557 0.020833333
## Olur19.281 0.005154639 0.005208333
## Olur19.277 0.005154639 0.010000000
## Olur19.269 0.005154639 0.015625000
plot(NF.freq.t)

Testing out another tutorial & R package to generate statistics

Source: http://popgen.nescent.org/startMicrosatellite.html

knitr::opts_chunk$set(library("adegenet"))
knitr::opts_chunk$set(library("pegas"))

First generate stats on NFW-2016 population & plot Observed vs. Expected Heterozygosity

NF.summary <- summary(NF)
NFW.2017.summary <- summary(popsub(NF, "NFW-2017"))
NFW.2017.summary
## 
## // Number of individuals: 99
## // Group sizes: 99
## // Number of alleles per locus: 30 13 15 19 20 19
## // Number of alleles per group: 116
## // Percentage of missing data: 1.85 %
## // Observed heterozygosity: 0.98 0.71 0.88 0.93 0.94 0.93
## // Expected heterozygosity: 0.94 0.74 0.87 0.93 0.93 0.93
plot(NFW.2017.summary$Hobs, xlab="Loci number", ylab="Observed Heterozygosity", 
     main="Observed heterozygosity per locus, NFW 2017")

plot(NFW.2017.summary$Hobs, NFW.2017.summary$Hexp, xlab="Observed Heterozygosity", ylab="Expected Heterozygosity", 
     main="Expected ~ Observed Heterozygosity per locus, NFW 2017")

bartlett.test(list(NFW.2017.summary$Hexp, NFW.2017.summary$Hobs)) #indicates no difference between mean observed and expected heterozygosity 
## 
##  Bartlett test of homogeneity of variances
## 
## data:  list(NFW.2017.summary$Hexp, NFW.2017.summary$Hobs)
## Bartlett's K-squared = 0.22016, df = 1, p-value = 0.6389

Run same stats on the NFH-2016 population

NFH.2016.summary <- summary(popsub(NF, "NFH-2016"))
NFH.2016.summary
## 
## // Number of individuals: 100
## // Group sizes: 100
## // Number of alleles per locus: 31 14 14 17 18 18
## // Number of alleles per group: 112
## // Percentage of missing data: 3 %
## // Observed heterozygosity: 0.93 0.71 0.95 0.94 0.94 0.93
## // Expected heterozygosity: 0.94 0.75 0.86 0.92 0.92 0.92
plot(NFH.2016.summary$Hobs, xlab="Loci number", ylab="Observed Heterozygosity", 
     main="Observed heterozygosity per locus, NFH 2016")

plot(NFH.2016.summary$Hobs, NFH.2016.summary$Hexp, xlab="Observed Heterozygosity", ylab="Expected Heterozygosity", 
     main="Expected ~ Observed Heterozygosity per locus, NFH 2016")

bartlett.test(list(NFH.2016.summary$Hexp, NFH.2016.summary$Hobs)) #indicates no difference between mean observed and expected heterozygosity 
## 
##  Bartlett test of homogeneity of variances
## 
## data:  list(NFH.2016.summary$Hexp, NFH.2016.summary$Hobs)
## Bartlett's K-squared = 0.26331, df = 1, p-value = 0.6079

Calculate and plot allelic frequency

Modified from the following script: http://www.molecularecologist.com/wp-content/uploads/2012/03/Allelefrequency_calculations2.txt

# I need the data as a normal dataframe, so I re-read the data into R as .csv
NF.csv <- read.csv(file="Data/Oly2016NFH+2017NFW_Merged.csv", stringsAsFactors = F)
names(NF.csv) <- NF.csv[2,]
NF.csv <- NF.csv[-1:-2,]
NFW.df <- subset(NF.csv, NF.csv$Population == "NFW-2017") #subset the wild pop
NFH.df <- subset(NF.csv, NF.csv$Population == "NFH-2016") #subset the hatchery pop
NFW.df.1 <- NFW.df[,-1:-2] #remove metadata 
NFH.df.1 <- NFH.df[,-1:-2] #remove metadata

Write a function to calculate allelic frequency and produce a dataframe with results

allelic.freq <- function(df) {
  L=ncol(df)   #how many columns are there?
  locus_positions=(2*(unique(round((1:(L-2))/2)))+1)   #find the starting column number for each locus
  lnames=colnames(df)                          #locus names, from the header
  OUT=list()        #create a null dataset to append allele freqs to
  
  for (x in locus_positions) {                       #begin for loop, to calculate frequencies for each locus
    alleles=c(df[,x],df[,x+1])        #For example, combine columns 1 and 2 for locus 1 (two columns because they are diploid)
    alleles2=as.data.frame(table(alleles))             #count each allele at locus x
    missing=alleles2[which(alleles2[,1]==0),2]          #count missing data at locus x, entered as '0' in this dataset (not used further for simplicity)
    alleles3=alleles2[which(!alleles2[,1]==0),]          #remove missing data (otherwise 0 would be counted in total number of alleles)
    alleles4=cbind(alleles3,alleles3[,2]/sum(alleles3[,2])) #calculate frequencies
    output=cbind(x,lnames[x],alleles4)                        #combine x, locusname, and frequencies
    OUT[[x]] <- output
  }  
  OUT.1 <- do.call(rbind, OUT)
  colnames(OUT.1) <- c("Number","Locus","allele","count","frequency") #add column headers
  return(OUT.1)
}

Run the allelic.freq function on thie hatchery and wild populations seperately, then save to txt files

NFH.afreq <- allelic.freq(NFW.df.1)[,-1]
NFW.afreq <- allelic.freq(NFH.df.1)[,-1]
write.table(NFH.afreq,file="Analyses/NFH-2016-Allelefrequencies.txt",row.names=FALSE,col.names=TRUE,sep="\t",append=FALSE)
write.table(NFW.afreq,file="Analyses/NFW-2017-Allelefrequencies.txt",row.names=FALSE,col.names=TRUE,sep="\t",append=FALSE)

Write a function to generate allelic frequency plots

Freq.Plots <- function(wanted_locus, frequency_table, population) {
  Locus=frequency_table[which(frequency_table[,1]==wanted_locus),]
  plot(as.numeric(as.character(Locus[,2])),as.numeric(as.character(Locus[,4])),xlab="Allele",ylab="Frequency",main=paste(population, "_", "Locus_",Locus[1,1]),pch=21,bg="blue",cex=1.5)
  plot(1:length(Locus[,2]),sort(as.numeric(as.character(Locus[,4])),decreasing=TRUE),xlab="Allele (orderd by frequency)",ylab="Frequency",main=paste(population, "_", "Locus_",Locus[1,1], sep=""),pch=21,bg="blue",cex=1.5)
}

Generate allelic frequency plots for each loci, for each population:

# Generate plots for NFH-2016 population
loci <- unique(NFH.afreq$Locus)
for (i in 1:length(loci)) {
  path <- file.path(paste("Analyses/", "FreqPlots_NFH-2016_", loci[i], ".pdf", sep = ""))
  pdf(file=path)
  Freq.Plots(loci[i], NFH.afreq, "NFH-2016")
  dev.off()
}
# Here are example plots: 
Freq.Plots("Olur19", NFH.afreq, "NFH-2016")

# Generate plots for NFW-2017 population
loci <- unique(NFW.afreq$Locus)
for (i in 1:length(loci)) {
  path <- file.path(paste("Analyses/", "FreqPlots_NFW-2017_", loci[i], ".pdf", sep = ""))
  pdf(file=path)
  Freq.Plots(loci[i], NFW.afreq, "NFW-2017")
  dev.off()
}
# Here are example plots: 
Freq.Plots("Olur19", NFW.afreq, "NFW-2017")

Written on January 17, 2018