## ----global_options, include=FALSE--------------------------------------------
# Chunk defaults for the rendered document: fixed figure position,
# centred figures, and no warnings or messages in the output.
knitr::opts_chunk$set(
    fig.pos = "H",
    fig.align = "center",
    warning = FALSE,
    message = FALSE
)

## ----eval=FALSE---------------------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  
#  BiocManager::install("GBScleanR")

## ----eval=FALSE---------------------------------------------------------------
#  if (!requireNamespace("devtools", quietly = TRUE))
#      install.packages("devtools")
#  devtools::install_github("tomoyukif/GBScleanR", build_vignettes = TRUE)

## ----warning=FALSE, message=FALSE---------------------------------------------
library(GBScleanR)

## -----------------------------------------------------------------------------
# Example VCF shipped with the package, and a temporary path for its GDS copy.
vcf_fn <- system.file("extdata", "sample.vcf", package = "GBScleanR")
gds_fn <- tempfile(pattern = "sample", fileext = ".gds")

## -----------------------------------------------------------------------------
# Convert the VCF file to a GDS file.
# `force = TRUE` allows the function to overwrite the GDS file,
# even if a GDS file already exists at `out_fn`.
gbsrVCF2GDS(
    vcf_fn = vcf_fn,
    out_fn = gds_fn,
    force = TRUE,
    verbose = FALSE
)

## -----------------------------------------------------------------------------
# Load the converted file; `gds` is the object used throughout this script.
gds <- loadGDS(gds_fn, verbose = FALSE)

## -----------------------------------------------------------------------------
# Basic accessors. Each call below is auto-printed; `head()` limits the
# output to the first few entries.

# Number of samples
nscan(gds)

## -----------------------------------------------------------------------------
# Number of SNPs (markers)
nsnp(gds) 

## -----------------------------------------------------------------------------
# Chromosome indices of all markers
head(getChromosome(gds)) 

## -----------------------------------------------------------------------------
# Chromosome names of all markers (`name = TRUE`)
head(getChromosome(gds, name = TRUE)) 

## -----------------------------------------------------------------------------
# Unique set of chromosome names (`levels = TRUE`)
getChromosome(gds, levels = TRUE) 

## -----------------------------------------------------------------------------
# Position (bp) of all markers
head(getPosition(gds)) 

## -----------------------------------------------------------------------------
# Reference allele of all markers
head(getAlleleA(gds)) 

## -----------------------------------------------------------------------------
# Alternative allele of all markers
head(getAlleleB(gds)) 

## -----------------------------------------------------------------------------
# SNP (marker) IDs
head(getSnpID(gds)) 

## -----------------------------------------------------------------------------
# Sample IDs
head(getScanID(gds)) 

## -----------------------------------------------------------------------------
# Retrieve the genotype data from the GDS object.
geno <- getGenotype(gds)

## -----------------------------------------------------------------------------
# Retrieve the read data from the GDS object.
# NOTE(review): the result is stored under the same name `geno`, so the
# genotype data fetched in the previous chunk is overwritten here.
geno <- getRead(gds)

## -----------------------------------------------------------------------------
# Summarise genotype calls and read counts; the updated object is assigned
# back to `gds`. Presumably the histogram and getter calls further down rely
# on these summaries - confirm in ?countGenotype and ?countRead.
gds <- countGenotype(gds)
gds <- countRead(gds)

## ----fig.alt="Missing rate per marker and per sample."------------------------
# Histograms of missing rate
histGBSR(gds, stats = "missing")

## ----fig.alt="Heterozygosity per marker and per sample."----------------------
# Histograms of heterozygosity
histGBSR(gds, stats = "het")

## ----fig.alt="Reference allele frequency per marker and per sample."----------
# Histograms of reference allele frequency
histGBSR(gds, stats = "raf")

## ----fig.alt="Total read depth per marker and per sample."--------------------
# Histograms of total read depth
histGBSR(gds, stats = "dp")

## ----fig.alt="Reference read depth per marker and per sample."----------------
# Histograms of reference allele read depth
histGBSR(gds, stats = "ad_ref")

## ----fig.alt="Alternative read depth per marker and per sample."--------------
# Histograms of alternative allele read depth.
# This chunk must plot `ad_alt`; `ad_ref` is already shown in the chunk above.
histGBSR(gds, stats = "ad_alt")

## ----fig.alt="Reference read per marker and per sample."----------------------
# Histograms of reference read frequency (`rrf`), as opposed to the
# reference allele frequency (`raf`) plotted earlier.
histGBSR(gds, stats = "rrf")

## -----------------------------------------------------------------------------
# Calculate read statistics, requesting the 0.5 quantile through `q`.
# The histograms below are drawn from the `mean_*`, `sd_*` and `qtile_*`
# statistics for the reference and alternative alleles.
gds <- calcReadStats(gds, q = 0.5)

## ----fig.alt="Mean of reference read depth per marker and per sample."--------
# Histograms of mean reference allele read depth
histGBSR(gds, stats = "mean_ref")

## ----fig.alt="Mean of alternative read depth per marker and per sample."------
# Histograms of mean alternative allele read depth
# (`mean_alt`; the reference counterpart is plotted in the chunk above)
histGBSR(gds, stats = "mean_alt")

## ----fig.alt="SD of reference read depth per marker and per sample."----------
# Histograms of standard deviation of reference allele read depth
histGBSR(gds, stats = "sd_ref")

## ----fig.alt="SD of alternative read depth per marker and per sample."--------
# Histograms of standard deviation of alternative allele read depth
# (`sd_alt`; the reference counterpart is plotted in the chunk above)
histGBSR(gds, stats = "sd_alt")

## ----fig.alt="Quantile of reference read depth per marker and per sample."----
# Histograms of the 0.5 quantile of reference allele read depth.
# `q` matches the value passed to calcReadStats() above.
histGBSR(gds, stats = "qtile_ref", q = 0.5)

## ----fig.alt="Quantile of alternative read depth per marker and per sample."----
# Histograms of the 0.5 quantile of alternative allele read depth
# (`qtile_alt`; the reference counterpart is plotted in the chunk above)
histGBSR(gds, stats = "qtile_alt", q = 0.5)

## -----------------------------------------------------------------------------
# Plot the "missing" statistic with plotGBSR().
# NOTE(review): presumably drawn along marker positions - confirm in ?plotGBSR.
plotGBSR(gds, stats = "missing")

## -----------------------------------------------------------------------------
# Plot the "geno" statistic with plotGBSR().
plotGBSR(gds, stats = "geno")

## -----------------------------------------------------------------------------
# Plot one statistic against another: here "missing" versus "dp".
pairsGBSR(gds, stats1 = "missing", stats2 = "dp")

## -----------------------------------------------------------------------------
# Getters for the count summaries. In every pair below, `target = "snp"`
# returns per-marker values and `target = "scan"` returns per-sample values;
# `head()` prints only the first few.

# Reference genotype count per marker
head(getCountGenoRef(gds, target = "snp")) 
# Reference genotype count per sample
head(getCountGenoRef(gds, target = "scan")) 

## -----------------------------------------------------------------------------
# Heterozygote count per marker
head(getCountGenoHet(gds, target = "snp")) 
# Heterozygote count per sample
head(getCountGenoHet(gds, target = "scan")) 

## -----------------------------------------------------------------------------
# Alternative genotype count per marker
head(getCountGenoAlt(gds, target = "snp")) 
# Alternative genotype count per sample
head(getCountGenoAlt(gds, target = "scan"))

## -----------------------------------------------------------------------------
# Missing genotype count per marker
head(getCountGenoMissing(gds, target = "snp")) 
# Missing genotype count per sample
head(getCountGenoMissing(gds, target = "scan")) 

## -----------------------------------------------------------------------------
# Reference allele count per marker
head(getCountAlleleRef(gds, target = "snp")) 
# Reference allele count per sample
head(getCountAlleleRef(gds, target = "scan")) 

## -----------------------------------------------------------------------------
# Alternative allele count per marker
head(getCountAlleleAlt(gds, target = "snp")) 
# Alternative allele count per sample
head(getCountAlleleAlt(gds, target = "scan")) 

## -----------------------------------------------------------------------------
# Missing allele count per marker
head(getCountAlleleMissing(gds, target = "snp")) 
# Missing allele count per sample
head(getCountAlleleMissing(gds, target = "scan")) 

## -----------------------------------------------------------------------------
# Reference read count per marker
head(getCountReadRef(gds, target = "snp")) 
# Reference read count per sample
head(getCountReadRef(gds, target = "scan")) 

## -----------------------------------------------------------------------------
# Alternative read count per marker
head(getCountReadAlt(gds, target = "snp")) 
# Alternative read count per sample
head(getCountReadAlt(gds, target = "scan")) 

## -----------------------------------------------------------------------------
# Sum of reference and alternative read counts per marker
head(getCountRead(gds, target = "snp")) 
# Sum of reference and alternative read counts per sample
head(getCountRead(gds, target = "scan")) 

## -----------------------------------------------------------------------------
# Getters for the read statistics and allele-frequency summaries.
# `target = "snp"` is per marker, `target = "scan"` is per sample.

# Mean of reference allele read count per marker
head(getMeanReadRef(gds, target = "snp")) 
# Mean of reference allele read count per sample
head(getMeanReadRef(gds, target = "scan"))

## -----------------------------------------------------------------------------
# Mean of alternative allele read count per marker
head(getMeanReadAlt(gds, target = "snp")) 
# Mean of alternative allele read count per sample
head(getMeanReadAlt(gds, target = "scan")) 

## -----------------------------------------------------------------------------
# SD of reference allele read count per marker
head(getSDReadRef(gds, target = "snp")) 
# SD of reference allele read count per sample
head(getSDReadRef(gds, target = "scan")) 

## -----------------------------------------------------------------------------
# SD of alternative allele read count per marker
head(getSDReadAlt(gds, target = "snp")) 
# SD of alternative allele read count per sample
head(getSDReadAlt(gds, target = "scan"))

## -----------------------------------------------------------------------------
# Quantile (q = 0.5) of reference allele read count per marker
head(getQtileReadRef(gds, target = "snp", q = 0.5)) 
# Quantile (q = 0.5) of reference allele read count per sample
head(getQtileReadRef(gds, target = "scan", q = 0.5))

## -----------------------------------------------------------------------------
# Quantile (q = 0.5) of alternative allele read count per marker
head(getQtileReadAlt(gds, target = "snp", q = 0.5))
# Quantile (q = 0.5) of alternative allele read count per sample
head(getQtileReadAlt(gds, target = "scan", q = 0.5)) 

## -----------------------------------------------------------------------------
# Minor allele frequency per marker
head(getMAF(gds, target = "snp")) 
# Minor allele frequency per sample
head(getMAF(gds, target = "scan")) 

## -----------------------------------------------------------------------------
# Minor allele count per marker
head(getMAC(gds, target = "snp")) 
# Minor allele count per sample
head(getMAC(gds, target = "scan")) 

## -----------------------------------------------------------------------------
# Per-marker genotype summaries requested with `prop = TRUE`.
# NOTE(review): presumably this returns proportions rather than raw counts -
# confirm in the getter documentation.
head(getCountGenoRef(gds, target = "snp", prop = TRUE))
head(getCountGenoHet(gds, target = "snp", prop = TRUE))
head(getCountGenoAlt(gds, target = "snp", prop = TRUE))
head(getCountGenoMissing(gds, target = "snp", prop = TRUE))

## -----------------------------------------------------------------------------
# Per-marker allele summaries with `prop = TRUE`.
head(getCountAlleleRef(gds, target = "snp", prop = TRUE))
head(getCountAlleleAlt(gds, target = "snp", prop = TRUE))
head(getCountAlleleMissing(gds, target = "snp", prop = TRUE))

## -----------------------------------------------------------------------------
# Per-marker read summaries with `prop = TRUE`.
head(getCountReadRef(gds, target = "snp", prop = TRUE))
head(getCountReadAlt(gds, target = "snp", prop = TRUE))

## ----eval=FALSE---------------------------------------------------------------
#  gds <- setSnpFilter(gds, missing = 0.2, het = c(0.1, 0.9), maf = 0.05)
#  gds <- setScanFilter(gds, missing = 0.8, het = c(0.25, 0.75))

## ----eval=FALSE---------------------------------------------------------------
#  gds <- setCallFilter(gds, dp_count = c(5, Inf))

## ----eval=FALSE---------------------------------------------------------------
#  # Filtering genotype calls based on total read counts
#  gds <- setCallFilter(gds, norm_dp_count = c(0, 1000))
#  # Filtering genotype calls based on reference read counts
#  # and alternative read counts separately.
#  gds <- setCallFilter(gds, norm_ref_count = c(0, 1000),
#                         norm_alt_count = c(0, 800))

## -----------------------------------------------------------------------------
# Apply a call filter on read depth (`dp_count` range of 5 to Inf), then a
# marker filter with `missing = 0.2`.
gds <- setCallFilter(gds, dp_count = c(5, Inf))
gds <- setSnpFilter(gds, missing = 0.2)

## -----------------------------------------------------------------------------
# Here we select only one marker from each 150 bp stretch.
gds <- thinMarker(gds, range = 150) 

## -----------------------------------------------------------------------------
# Recalculate the genotype, read, and read-statistic summaries after filtering.
gds <- countGenotype(gds)
gds <- countRead(gds)
gds <- calcReadStats(gds)

## -----------------------------------------------------------------------------
# Inspect which markers and samples are currently valid (first few entries).
head(getValidSnp(gds))
head(getValidScan(gds))

## -----------------------------------------------------------------------------
# Marker count with the default arguments, then with `valid = FALSE`.
# NOTE(review): presumably the first counts only markers that passed the
# filters and the second counts all markers - confirm in ?nsnp.
nsnp(gds)
nsnp(gds, valid = FALSE)

## -----------------------------------------------------------------------------
# Write a subset of `gds` to a new temporary GDS file.
# NOTE(review): presumably only the data passing the filters set above is
# kept - confirm in ?subsetGDS.
subset_fn <- tempfile("sample_subset", fileext = ".gds")
subset_gds <- subsetGDS(gds, out_fn = subset_fn)

## ----eval = FALSE-------------------------------------------------------------
#  subset_gds <- loadGDS(subset_fn, verbose = FALSE)

## -----------------------------------------------------------------------------
# Close the subset file, then load it again from its path.
closeGDS(subset_gds)
subset_gds <- loadGDS(subset_fn, verbose = FALSE)
# If you give a GbsrGenotypeData object to `loadGDS()`,
# the function tries to reload the GDS file.
subset_gds <- loadGDS(subset_gds, verbose = FALSE)

## -----------------------------------------------------------------------------
# Open the GDS file behind `gds` and keep the returned object.
# NOTE(review): presumably required after the subsetting step above - confirm.
gds <- openGDS(gds)

## -----------------------------------------------------------------------------
# Each reset function returns the updated object, which is assigned back.
# Reset the filter on markers
gds <- resetSnpFilters(gds) 
# Reset the filter on samples
gds <- resetScanFilters(gds) 
# Reset the filter on calls
gds <- setRawGenotype(gds) 
# Reset all filters
gds <- resetFilters(gds) 

## -----------------------------------------------------------------------------
# Pick the parental samples: sample IDs matching "Founder1" and "Founder2".
p1 <- grep("Founder1", getScanID(gds), value = TRUE)
p2 <- grep("Founder2", getScanID(gds), value = TRUE)
# Register them as parents.
# NOTE(review): `flip`, `mono` and `bi` presumably control allele flipping and
# restriction to monomorphic-in-parent, biallelic markers - confirm in
# ?setParents.
gds <- setParents(gds, parents = c(p1, p2), flip = TRUE, mono = TRUE, bi = TRUE)

## -----------------------------------------------------------------------------
# Recount genotypes now that the parents are set.
gds <- countGenotype(gds)

## -----------------------------------------------------------------------------
# Histograms of missing rate
histGBSR(gds, stats = "missing")

## -----------------------------------------------------------------------------
# Histograms of heterozygosity
histGBSR(gds, stats = "het")

## -----------------------------------------------------------------------------
# Histograms of reference allele frequency
histGBSR(gds, stats = "raf")

## ----eval=FALSE---------------------------------------------------------------
#  # filter out markers with reference allele frequency
#  # less than 5% or more than 95%.
#  gds <- setSnpFilter(gds, maf = 0.05)

## ----eval=FALSE---------------------------------------------------------------
#  # Filter out samples with more than 90% missing genotype calls,
#  # less than 5% heterozygosity, and less than 5% minor allele frequency.
#  gds <- setScanFilter(gds, missing = 0.9, het = c(0.05, 1), maf = 0.05)

## -----------------------------------------------------------------------------
# Filter out genotype calls supported by fewer than 2 reads.
gds <- setCallFilter(gds, dp_count = c(2, Inf))
# Filter out genotype calls supported by more than 100 reads.
gds <- setCallFilter(gds, dp_count = c(0, 100))
# Filter out genotype calls based on quantile values 
# of read counts at markers in each sample.
gds <- setCallFilter(gds, scan_ref_qtile = c(0, 0.9), scan_alt_qtile = c(0, 0.9))

## -----------------------------------------------------------------------------
# Remove markers having more than 20% of missing genotype calls
# (`missing = 0.2`), then print the remaining marker count.
gds <- setSnpFilter(gds, missing = 0.2) 
nsnp(gds)

## -----------------------------------------------------------------------------
# Recount genotypes with `node = "filt"`.
# NOTE(review): presumably this counts from the filtered genotype calls
# produced by the call filter above - confirm in ?countGenotype.
gds <- countGenotype(gds, node = "filt")

## -----------------------------------------------------------------------------
# Histograms of missing rate
histGBSR(gds, stats = "missing")

## -----------------------------------------------------------------------------
# Histograms of heterozygosity
histGBSR(gds, stats = "het")

## -----------------------------------------------------------------------------
# Histograms of reference allele frequency
histGBSR(gds, stats = "raf")

## -----------------------------------------------------------------------------
# Plot the reference allele frequency statistic with plotGBSR()
plotGBSR(gds, stats = "raf")

## -----------------------------------------------------------------------------
# Apply a marker filter on minor allele frequency (`maf = 0.25`) and print
# the remaining marker count.
gds <- setSnpFilter(gds, maf = 0.25)
nsnp(gds)

## -----------------------------------------------------------------------------
# Recount genotypes after the MAF filter.
# NOTE(review): no `node` argument here, unlike the `node = "filt"` recount
# earlier - confirm this is intended.
gds <- countGenotype(gds)
histGBSR(gds, stats = "missing")

## -----------------------------------------------------------------------------
# Histograms of heterozygosity
histGBSR(gds, stats = "het")

## -----------------------------------------------------------------------------
# Histograms of reference allele frequency
histGBSR(gds, stats = "raf")

## -----------------------------------------------------------------------------
# Marker density
plotGBSR(gds, stats = "marker")

## -----------------------------------------------------------------------------
# Plot the "geno" statistic with plotGBSR()
plotGBSR(gds, stats = "geno")

## -----------------------------------------------------------------------------
# Describe the breeding scheme: first a "pairing" cross whose mating matrix
# is the single column (1, 2), then one "selfing" step.
gds <- initScheme(
    gds,
    crosstype = "pairing",
    mating = matrix(1:2, nrow = 2)
)
gds <- addScheme(gds, crosstype = "selfing")

## -----------------------------------------------------------------------------
# Show the parent information registered in `gds`.
getParents(gds)

## ----eval=FALSE---------------------------------------------------------------
#  gds <- initScheme(gds, crosstype = "pair", mating = cbind(c(1:2), c(3:4), c(5:6), c(7:8)))

## ----eval=FALSE---------------------------------------------------------------
#  showScheme(gds)

## ----eval=FALSE---------------------------------------------------------------
#  gds <- addScheme(gds, crosstype = "pair", mating = cbind(c(9:10), c(11:12)))
#  
#  # Check IDs.
#  showScheme(gds)

## ----eval=FALSE---------------------------------------------------------------
#  gds <- addScheme(gds, crosstype = "pair", mating = cbind(c(13:14)))
#  
#  # Check IDs.
#  showScheme(gds)

## ----eval=FALSE---------------------------------------------------------------
#  # Inbreeding by five times selfing.
#  gds <- addScheme(gds, crosstype = "self")
#  gds <- addScheme(gds, crosstype = "self")
#  gds <- addScheme(gds, crosstype = "self")
#  gds <- addScheme(gds, crosstype = "self")
#  gds <- addScheme(gds, crosstype = "self")

## ----message=FALSE------------------------------------------------------------
# Run the genotype estimation with `iter = 4`; the updated object is
# assigned back to `gds`.
gds <- estGeno(gds, iter = 4)

## ----eval=FALSE---------------------------------------------------------------
#  gds <- estGeno(gds, het_parent = TRUE, iter = 4)

## ----eval=FALSE---------------------------------------------------------------
#  # Following codes do the same.
#  gds <- estGeno(gds, iter = 1)
#  gds <- estGeno(gds, optim = FALSE)

## -----------------------------------------------------------------------------
# Genotype data read from the "cor" node.
# NOTE(review): presumably the corrected (estimated) genotypes written by
# estGeno() - confirm in ?getGenotype.
est_geno <- getGenotype(gds, node = "cor")

## -----------------------------------------------------------------------------
# Genotype data read from the "parents" node.
founder_geno <- getGenotype(gds, node = "parents")

## -----------------------------------------------------------------------------
# Haplotype data.
est_hap <- getHaplotype(gds)

## -----------------------------------------------------------------------------
# Export the data in `gds` to a temporary VCF file path.
out_fn <- tempfile("sample_est", fileext = ".vcf.gz")
gbsrGDS2VCF(gds, out_fn)

## -----------------------------------------------------------------------------
# Open the GDS file behind `gds` again after the export.
# NOTE(review): presumably the export leaves the connection closed - confirm.
gds <- openGDS(gds)

## -----------------------------------------------------------------------------
# Close both GDS files opened in this script.
closeGDS(gds)
closeGDS(subset_gds)

## -----------------------------------------------------------------------------
# Record the R session details for reproducibility.
sessionInfo()

