## globally defined constants
MAXCHARVARID <- 32
MAXCHARCHROMNAMES <- 32
MAXCHARALLELE <- 20

freezeKnownVariantsKG <- function(sourceURL=MafDbKGdefaultURL, targetdir=getwd(), MafDbFilename=MafDbFilename,
                                  genome="hg19", yieldSize=1000000) {
  vcfFilename <- tempfile(fileext=".vcf.gz")
  tabFilename <- paste0(vcfFilename, ".tbi")

  message(sprintf("Downloading known variants from the 1000 Genomes project server through the VCF file located at:\n%s", sourceURL))
  err <- download.file(url=sourceURL, destfile=vcfFilename, method="curl")
  if (err)
    stop(sprintf("could not download file from %s. Please check that your internet connection is working, that this URL is correct and that there is sufficient disk space in %s to store the file.", sourceURL, tempdir()))
  else {
    tryCatch({
      vcfHeader <- scanVcfHeader(vcfFilename)
    }, error=function(err) {
      stop(err)
      ## stop(sprintf("VCF file from URL %s either does not exist on that server or it has not a valid VCF header.", sourceURL), call.=TRUE)
    })
  }

  err <- download.file(url=paste0(sourceURL, ".tbi"), destfile=tabFilename, method="curl")
  if (err)
    stop(sprintf("could not download file from %s. Please check that your internet connection is working, that this URL is correct and that there is sufficient disk space in %s to store the file.", paste0(sourceURL, ".tbi"), tempdir()))
  else {
    tryCatch({
      tabHeader <- headerTabix(vcfFilename)
    }, error=function(err) {
      stop(sprintf("Tabix file from URL %s either does not exist on that server or it is not a valid tabix file.", paste0(sourceURL, ".tbi")), call.=TRUE)
    })
  }

  AFcols <- rownames(info(vcfHeader))
  AFcols <- AFcols[c(which(AFcols == "AF"), grep("_AF", AFcols))]

  param <- ScanVcfParam(geno=NA,
                        fixed="ALT",
                        info=AFcols)
  tab <- TabixFile(vcfFilename, yieldSize=yieldSize)
  open(tab)

  conn <- dbConnect(SQLite(), dbname=file.path(targetdir, MafDbFilename))
  sql <- c("CREATE TABLE knownVariants (\n",
           sprintf("  varID VARCHAR(%d) NOT NULL,\n", MAXCHARVARID),
           sprintf("  chrom VARCHAR(%d) NOT NULL,\n", MAXCHARCHROMNAMES),
           sprintf("  %s CHAR(1),\n", AFcols),
           "  start INTEGER NOT NULL,\n",
           sprintf("  ref VARCHAR(%d) NOT NULL,\n", MAXCHARALLELE),
           "  wref INTEGER NOT NULL,\n",
           sprintf("  alt VARCHAR(%d) NOT NULL,\n", MAXCHARALLELE),
           "  walt INTEGER NOT NULL\n",
           ")")
  VariantFiltering:::.dbEasyQuery(conn, paste(sql, collapse=""))

  message("Starting to process known variants")
  nVar <- 0
  while (nrow(vcf <- readVcf(tab, genome=genome, param=param))) {
    rd <- rowData(vcf)
    knownVariants <- data.frame(varID=rownames(vcf),
                                chrom=as.character(seqnames(rd)),
                                start=start(rd),
                                ref=substring(as.character(rd$REF), 1, MAXCHARALLELE),
                                wref=width(rd$REF),
                                alt=substring(unstrsplit(as(rd$ALT, "CharacterList"), sep="/"), 1, MAXCHARALLELE),
                                walt=as.integer(sapply(nchar(rd$ALT), max)),
                                check.names=FALSE, stringsAsFactors=FALSE)
    mafValues <- as.data.frame(info(vcf))
    rownames(mafValues) <- NULL
    for (afCol in AFcols)
      mafValues[[afCol]] <- rawToChar(codeAF2RAW(sapply(mafValues[[afCol]], max)), multiple=TRUE)

    knownVariants <- cbind(knownVariants, mafValues, stringsAsFactors=FALSE)                      
    knownVariants <- knownVariants[, c("varID", "chrom", AFcols, "start", "ref", "wref", "alt", "walt")]
    sql <- sprintf("INSERT INTO knownVariants VALUES (%s)", paste(rep("?", ncol(knownVariants)), collapse=","))
    VariantFiltering:::.dbEasyPreparedQuery(conn, paste(sql, collapse=""), knownVariants)
    nVar <- nVar + nrow(knownVariants)
    message(sprintf("%d known variants from the 1000 Genomes Project processed", nVar))
  }

  thispkg_version <- installed.packages()['VariantFiltering', 'Version']
  rsqlite_version <- installed.packages()['RSQLite', 'Version']

  metadata <- data.frame(name=c("Db type", "Supporting package", "Data source", "Data source tag",
                                "Resource URL", "Number of variants", "Db created by", "Creation date",
                                "VariantFiltering version at creation time", "RSQLite version at creation time"),
                         value=c("MafDb", "VariantFiltering", "1000 Genomes Project", "KG", sourceURL,
                                 sprintf("%d", nVar), "VariantFiltering package", date(), thispkg_version,
                                 rsqlite_version))

  sql <- c("CREATE TABLE metadata (\n",
           "  name TEXT NOT NULL,\n",
           "  value TEXT NOT NULL)\n")
  VariantFiltering:::.dbEasyQuery(conn, paste(sql, collapse=""))
  sql <- sprintf("INSERT INTO metadata VALUES (%s)", paste(rep("?", ncol(metadata)), collapse=","))
  VariantFiltering:::.dbEasyPreparedQuery(conn, paste(sql, collapse=""), metadata)

  dbDisconnect(conn)

  file.path(targetdir, MafDbFilename)
}

## adapted from makeTxDbPackage() in GenomicFeatures/R/makeTxDbPackage.R
makeMafDbPackageKG <- function(destDir=path.expand("~"), MafDbURL=MafDbKGdefaultURL,
                               MafDbPkgName=MafDbKGdefaultPkgName, genome="hg19",
                               version=NULL, author=NULL, maintainer=NULL, license=NULL,
                               yieldSize=1000000) {

  thisPkg <- getAnywhere("makeMafDbPackageKG")
  thisPkgName <- sub("package:", "", thisPkg$where[grep("package", thisPkg$where)[1]])

  if (is.null(version)) { ## by default custom versions of MafDb.* have x.y.z version numbers with y being odd
    version <- packageDescription(thisPkgName)$Version
    x <- as.integer(gsub(".[0-9]+.[0-9]+$", "", version))
    y <- as.integer(gsub("[0-9]+$", "", substring(version, gregexpr("[0-9]+.[0-9]+$", version)[[1]])))
    z <- as.integer(substring(version, gregexpr("[0-9]+$", version)[[1]]))
    if (y %% 2 == 0) {
      y <- y + 1
      z <- 0
    } else
      z <- z + 1

    version <- paste(x, y, z, sep=".")
  }

  if (is.null(author))
    author <- packageDescription(thisPkgName)$Author

  if (is.null(maintainer))
    maintainer <- maintainer(thisPkgName)

  if (is.null(license))
    license <- packageDescription(thisPkgName)$License

  symvals <- list(PKGTITLE="Minimum allele frequency data from 1000 Genomes",
                  PKGDESCRIPTION="Minimum allele frequency data frozen from the 1000 Genomes Project",
                  PKGVERSION=version,
                  AUTHOR=author,
                  MAINTAINER=maintainer,
                  LIC=license)

  res <- createPackage(pkgname=MafDbPkgName,
                       destinationDir=destDir,
                       originDir=system.file("MafDbPkgTemplate", package=thisPkgName),
                       symbolValues=symvals)

  suc <- file.copy(from=system.file("MafDbPkgTemplate", package=thisPkgName),
                   to=file.path(destDir, MafDbPkgName, "inst"), recursive=TRUE)
  if (!suc)
    stop(sprintf("Cannot write in the %s directory.", file.path(destDir, MafDbPkgName, "inst")))

  frozenDataDir <- file.path(destDir, MafDbPkgName, "inst", "extdata")

  dbnameKG <- freezeKnownVariantsKG(sourceURL=MafDbURL, targetdir=frozenDataDir,
                                    MafDbFilename=paste0(MafDbPkgName, ".sqlite"),
                                    genome=genome, yieldSize=yieldSize)

  message(sprintf("A new MafDb package has been created in directory %s.", file.path(destDir, MafDbPkgName)))
  message(sprintf("Please run the command lines:\n\n   R CMD build --keep-empty-dirs %s", file.path(destDir, MafDbPkgName)))
  message(sprintf("   R CMD INSTALL %s_%s.tar.gz", MafDbPkgName, version))
  message("\nto install the package on R.")

  invisible(res$pkgdir)
}
