## This README file explains the steps to download the phastCons conservation
## scores for human, calculated on 100 vertebrate species, and to freeze them
## in this annotation package. If you use these data please cite the following
## publication:

## Pollard KS, Hubisz MJ, Siepel A. Detection of non-neutral substitution rates
## on mammalian phylogenies. Genome Res. 2010 Jan;20(1):110-21.
## (http://genome.cshlp.org/content/early/2009/10/26/gr.097857.109.abstract)

## The data were downloaded from the UCSC genome browser with the Unix 'rsync'
## command as follows:
##
## $ rsync -avz --progress \
##     rsync://hgdownload.cse.ucsc.edu/goldenPath/hg19/phastCons100way/hg19.100way.phastCons/ \
##     ./hg19.100way.phastCons

## The following R script processes the downloaded data to
## store the phastCons scores in a single RleList object

library(BSgenome.Hsapiens.UCSC.hg19)
library(rtracklayer)
library(doParallel)

registerDoParallel(cores=4) ## each process may need up to 20Gb of RAM

## transform WIG to BIGWIG format: one BigWig file per sequence. No
## destination is given to wigToBigWig(), so the output file name is
## derived from the input file name (the resulting
## '<chr>.phastCons100way.bw' files are the ones read in the next step)
si <- Seqinfo(seqnames=seqnames(Hsapiens), seqlengths=seqlengths(Hsapiens))
foreach (chr=seqnames(Hsapiens)) %dopar% {
  cat(chr, "\n")
  wigfname <- file.path("hg19.100way.phastCons",
                        sprintf("%s.phastCons100way.wigFix.gz", chr))
  ## sequences without a downloaded wigFix file are skipped with a message
  ## instead of raising an error that would stop the whole parallel loop;
  ## the later steps of this script also tolerate sequences without files.
  ## Genuine conversion errors are still propagated.
  if (file.exists(wigfname)) {
    wigToBigWig(wigfname, seqinfo=si)
  } else {
    message(sprintf("No wigFix phastcons file for sequence %s", chr))
  }
}

## transform BIGWIG into Rle objects coercing phastCons scores into
## 1-decimal digit raw-encoded values to reduce memory requirements.
## In principle, deciles of phastCons probabilities should give the
## necessary resolution for the purpose of filtering genetic variants
## on conservation. For every sequence, the resulting raw Rle object is
## saved under the name 'phastCons100way_<chr>' in its own
## '<chr>.phastCons100way.RData' file.

foreach (chr=seqnames(Hsapiens)) %dopar% {
  cat(chr, "\n")
  tryCatch({
    objname <- sprintf("phastCons100way_%s", chr)
    bwfname <- file.path("hg19.100way.phastCons",
                         sprintf("%s.phastCons100way.bw", chr))

    ## scores are rounded to one decimal digit and multiplied by 10; only the
    ## first element of the imported RleList is kept (presumably each BigWig
    ## file holds just the sequence it was built from -- confirm if the
    ## BigWig files are produced differently)
    pcrle <- (10*round(import.bw(BigWigFile(bwfname), asRle=TRUE), digits=1))[[1]]

    ## replace the numeric run values by their raw-encoded counterparts
    runValue(pcrle) <- as.raw(runValue(pcrle))

    ## save() stores objects by name, so bind the Rle to its final name first
    assign(objname, pcrle)
    save(list=objname,
         file=file.path("hg19.100way.phastCons", sprintf("%s.phastCons100way.RData", chr)))

    ## release memory before this worker processes the next sequence
    rm(list=c(objname, "pcrle"))
    gc()
  }, error=function(err) {
    ## message() has no 'call.' argument: passing call.=TRUE would just
    ## append the text "TRUE" to the reported message
    message(chr, " ", conditionMessage(err))
  })
}

## gather the per-sequence Rle phastCons objects into one uncompressed
## RleList object called 'phastCons100way', with one element per sequence
## of Hsapiens; sequences for which no .RData file exists get an empty raw
## Rle object

phastCons100way <- RleList(
  mclapply(seqnames(Hsapiens),
           function(seqname) {
             rdatafile <- file.path("hg19.100way.phastCons",
                                    sprintf("%s.phastCons100way.RData", seqname))

             ## no stored scores for this sequence: report it and return
             ## an empty placeholder
             if (!file.exists(rdatafile)) {
               message(sprintf("No Rle phastcons file for sequence %s", seqname))
               return(Rle(raw()))
             }

             ## load() restores the object 'phastCons100way_<seqname>' into
             ## the environment of this function, from where it is fetched
             message(sprintf("Loading %s", rdatafile))
             load(rdatafile)
             get(sprintf("phastCons100way_%s", seqname))
           },
           mc.cores=4),
  compress=FALSE)
names(phastCons100way) <- seqnames(Hsapiens)
save(phastCons100way, file="phastCons100way.rda")

## freeze the GenomeDescription data for Hsapiens for later retrieval at
## loading time; every field is copied from the Hsapiens BSgenome object

refgenomeGD <- local({
  ## sequence information (names, lengths, circularity, genome name) is
  ## built first and then handed over to the GenomeDescription constructor
  refseqinfo <- Seqinfo(seqnames=seqnames(Hsapiens),
                        seqlengths=seqlengths(Hsapiens),
                        isCircular=isCircular(Hsapiens),
                        genome=releaseName(Hsapiens))

  GenomeDescription(organism=organism(Hsapiens),
                    species=species(Hsapiens),
                    provider=provider(Hsapiens),
                    provider_version=providerVersion(Hsapiens),
                    release_date=releaseDate(Hsapiens),
                    release_name=releaseName(Hsapiens),
                    seqinfo=refseqinfo)
})

save(refgenomeGD, file="refgenomeGD.rda")
