## Insert the rows of 'bind.data' into the database inside one transaction.
##
## conn      : open database connection
## sql       : parameterised statement; its '?' placeholders are filled
##             from the columns of 'bind.data'
## bind.data : data.frame handed to dbGetPreparedQuery() as the bind values
##
## Returns the value of dbCommit(). If the insert fails the transaction is
## rolled back and the original error is re-signalled, so a failed load does
## not leave an open transaction on the connection.
.dbAddData <- function(conn, sql, bind.data)
{
    dbBeginTransaction(conn)
    tryCatch(
        dbGetPreparedQuery(conn, sql, bind.data),
        error = function(e) {
            ## best-effort rollback; never mask the error that caused it
            try(DBI::dbRollback(conn), silent=TRUE)
            stop(e)
        }
    )
    dbCommit(conn)
}

## Read a delimited missense file into a data.frame.
##
## file : path of the file to read
##
## The column names are the whitespace-separated tokens of the first line
## with every '#' character removed. Fields equal to "?" are read as NA.
## The result is the parsed table with an extra leading column 'rsid'
## holding a copy of the 'snp_id' column.
.loadMissense <- function(file)
{
    ## header tokens come from the first line only
    header <- scan(file, what="character", nlines=1)
    header <- unlist(header, use.names=FALSE)
    header <- gsub("#", "", header, fixed=TRUE)

    tbl <- read.delim(
        file,
        strip.white=TRUE,
        stringsAsFactors=FALSE,
        na.strings="?",
        col.names=header
    )

    rsid <- tbl$snp_id
    data.frame(rsid=rsid, tbl)
}

## Read a header-less, tab-delimited file of duplicate groups.
##
## file : path of the file to read; each line lists the tab-separated
##        ids that belong to one group
##
## Returns a data.frame with one row per id:
##   rsid            - the id (character)
##   duplicate_group - index of the line the id was read from
## An empty file yields a zero-row data.frame that still has both columns.
.loadDuplicate <- function(file)
{
    ## no header
    rawdata <- readLines(file)
    lines <- strsplit(rawdata, "\t")

    ## one integer count per line; rep() then repeats each line index
    ## once for every id found on that line
    counts <- vapply(lines, length, integer(1))
    group <- rep(seq_len(length(lines)), counts)

    ## unlist() of an empty list is NULL, which data.frame() would drop;
    ## as.character() keeps the column present for empty input
    data.frame("rsid"=as.character(unlist(lines)), "duplicate_group"=group,
        stringsAsFactors=FALSE)
}

## Write the 'metadata' table: a two-column (name, value) table of fixed
## descriptive entries plus the creation time and the installed RSQLite
## version.
##
## conn : open database connection the table is written to
##
## Prints a progress line when the table has been written.
.writeMetadata <- function(conn)
{
    ## installed.packages() inspects every installed package, so look the
    ## version up once and reuse it below
    rsqlite_version <- installed.packages()['RSQLite', 'Version']
    mat <- matrix(c(
        "Db type", "PolyPhenDb",
        "Data source", "PolyPhen2",
        "Genome", "hg19",
        "Genus and Species", "Homo sapiens",
        "Resource URL",
            "http://genetics.bwh.harvard.edu/pph2/dokuwiki/downloads",
        "dbSNP build", "131",
        "Creation time",   svn.time(),
        "RSQLite version at creation time", rsqlite_version,
        "package", "VariantAnnotation"),
        ncol=2, byrow=TRUE
    )
    colnames(mat) <- c("name", "value")
    metadata <- data.frame(name=mat[ , "name"], value=mat[ , "value"],
        stringsAsFactors=FALSE)
    dbWriteTable(conn, "metadata", metadata, row.names=FALSE)
    cat("done with 'metadata' table \n")
}

## Create the 'duplicates' table with an index on RSID and fill it from
## the duplicate-group file.
##
## conn : open database connection
## file : path handed to .loadDuplicate()
##
## The loaded data.frame is inserted positionally: its first column goes
## to RSID and its second to DUPLICATEGROUP. Prints a progress line when
## the table has been written.
.writeDuplicates <- function(conn, file)
{
    ## load first so a read failure leaves the database untouched
    dups <- .loadDuplicate(file)

    create_sql <- paste(
        "CREATE TABLE duplicates (\n",
        " RSID TEXT NOT NULL,\n",
        " DUPLICATEGROUP INTEGER \n",
        ")",
        sep="")
    dbGetQuery(conn, create_sql)
    dbGetQuery(conn, "CREATE INDEX duplicates_index on duplicates (RSID)")

    .dbAddData(conn, "INSERT INTO duplicates VALUES (?,?)", dups)
    cat("done with 'duplicates' table \n")
}

## Create the 'ppdata' table with an index on RSID and fill it with the
## combined contents of the two missense files.
##
## conn   : open database connection
## humvar : path of the HumVar file; its columns must be a subset of the
##          HumDiv columns and its rsids a subset of the HumDiv rsids
## humdiv : path of the HumDiv file
##
## Both files are read with .loadMissense(). The HumVar rows are expanded
## to the HumDiv column layout (columns absent from HumVar are NA), the two
## tables are stacked with HumDiv first, and a 'training_set' column
## ("humdiv" / "humvar") is inserted after 'rsid'. Rows are inserted
## positionally, so the column order of the combined table must match the
## CREATE TABLE statement below.
##
## Stops with an error if the rsid or column subset checks fail.
.writePPdata <- function(conn, humvar, humdiv)
{
    ## keep the paths: 'humvar' and 'humdiv' are replaced by the loaded
    ## tables, but the error messages need the file names
    humvar_file <- humvar
    humdiv_file <- humdiv
    humvar <- .loadMissense(humvar_file)
    humdiv <- .loadMissense(humdiv_file)

    if (!all(humvar$rsid %in% humdiv$rsid))
        stop("not all rsid's in ", basename(humvar_file), " are in ",
            basename(humdiv_file))

    if (!all(colnames(humvar) %in% colnames(humdiv)))
        stop("not all columns in ", basename(humvar_file), " are in ",
            basename(humdiv_file))

    ## combine HumVar-short and HumDiv-full tables
    hvexp <- data.frame(matrix(nrow=nrow(humvar), ncol=ncol(humdiv)))
    colnames(hvexp) <- colnames(humdiv) 
    ## position of each HumVar column within the HumDiv layout; the column
    ## check above guarantees there are no NA matches
    idx <- match(colnames(humvar), colnames(humdiv))

    for(i in seq_along(idx)) 
        hvexp[,idx[i]] <- humvar[,i]
    cmb <- rbind(humdiv, hvexp)
    training_set <- c(rep("humdiv", nrow(humdiv)), rep("humvar", nrow(humvar)))
    combo <- data.frame(rsid=cmb[,1], training_set=training_set, cmb[,-1]) 

    sql <- c(
        "CREATE TABLE ppdata (\n",
        " RSID TEXT NOT NULL,\n",
        " TRAININGSET TEXT,\n",
        " OSNPID TEXT,\n",
        " OACC INTEGER,\n",
        " OPOS INTEGER,\n",
        " OAA1 TEXT,\n",
        " OAA2 TEXT,\n",
        " SNPID TEXT NOT NULL,\n",
        " ACC TEXT,\n",
        " POS TEXT,\n",
        " AA1 TEXT,\n",
        " AA2 TEXT,\n",
        " NT1 TEXT,\n",
        " NT2 TEXT,\n",
        " PREDICTION TEXT,\n",
        " BASEDON TEXT,\n",
        " EFFECT TEXT,\n",
        " PPH2CLASS TEXT,\n",
        " PPH2PROB REAL,\n",
        " PPH2FPR REAL,\n",
        " PPH2TPR REAL,\n",
        " PPH2FDR REAL,\n",
        " SITE TEXT,\n",
        " REGION TEXT,\n",
        " PHAT INTEGER,\n",
        " DSCORE REAL,\n",
        " SCORE1 REAL,\n",
        " SCORE2 REAL,\n",
        " NOBS INTEGER,\n",
        " NSTRUCT INTEGER,\n",
        " NFILT INTEGER,\n",
        " PDBID TEXT,\n",
        " PDBPOS INTEGER,\n",
        " PDBCH TEXT,\n",
        " IDENT REAL,\n",
        " LENGTH INTEGER,\n",
        " NORMACC REAL,\n",
        " SECSTR TEXT,\n",
        " MAPREG TEXT,\n",
        " DVOL INTEGER,\n",
        " DPROP REAL,\n",
        " BFACT REAL,\n",
        " HBONDS TEXT,\n",
        " AVENHET TEXT,\n",
        " MINDHET TEXT,\n",
        " AVENINT TEXT,\n",
        " MINDINT TEXT,\n",
        " AVENSIT TEXT,\n",
        " MINDSIT TEXT,\n",
        " TRANSV INTEGER,\n",
        " CODPOS INTEGER,\n",
        " CPG INTEGER,\n",
        " MINDJNC TEXT,\n",
        " PFAMHIT REAL,\n",
        " IDPMAX REAL,\n",
        " IDPSNP REAL,\n",
        " IDQMIN REAL,\n",
        " COMMENTS TEXT \n",
        ")")
    dbGetQuery(conn, paste(sql, collapse=""))
    dbGetQuery(conn, "CREATE INDEX rsid_index on ppdata (RSID)")

    ## one '?' placeholder per column of the combined table
    qmarks <- paste(rep("?", ncol(combo)), collapse=",")
    sql <- paste("INSERT INTO ppdata VALUES (", qmarks, ")", sep="") 
    .dbAddData(conn, sql, combo)
    cat("done with 'ppdata' table \n")
}

## Build the PolyPhen SQLite database.
##
## filepath : not used by this function body
## savepath : not used by this function body
## dbfile   : path of the SQLite database file to open/create
## hvsfile  : HumVar file, passed to .writePPdata() as 'humvar'
## hdffile  : HumDiv file, passed to .writePPdata() as 'humdiv'
## dsfile   : duplicates file, passed to .writeDuplicates()
## ...      : ignored; a warning is issued if anything is supplied
##
## Writes the 'ppdata', 'duplicates' and 'metadata' tables in that order and
## prints a completion line. The connection is closed when the function
## exits, including when one of the writers fails.
makePolyPhenDb <- function(filepath, savepath, dbfile, 
                           hvsfile, hdffile, dsfile, ...)
{
    if (length(list(...)) != 0L)
        warning("extra args are ignored for now")

    conn <- dbConnect(SQLite(), dbname=dbfile)
    ## registered right after connecting so an error in any writer below
    ## cannot leak the connection
    on.exit(dbDisconnect(conn), add=TRUE)

    .writePPdata(conn, hvsfile, hdffile)
    .writeDuplicates(conn, dsfile)
    .writeMetadata(conn)
    cat(paste("creation of ", basename(dbfile), " complete \n", sep=""))
}




