## How it's built:
require(GEOquery)
hm450k <- getGEO('GPL13534')@dataTable@table
names(hm450k) <- toupper(names(hm450k))
hm27k <- getGEO('GPL8490')@dataTable@table
names(hm27k) <- toupper(names(hm27k))
hm450k$PLATFORM <- 'HM450'
hm450k$CHANNEL27 <- NA
hm450k$CHANNEL450 <- hm450k$COLOR_CHANNEL
hm450k$ADDRESSA_450 <- hm450k$ADDRESSA_ID
hm450k$ADDRESSB_450 <- hm450k$ADDRESSB_ID
hm450k$ADDRESSA_27 <- NA
hm450k$ADDRESSB_27 <- NA
hm27k$PLATFORM <- 'HM27'
hm450k$PLATFORM[ which(hm450k$NAME %in% hm27k$NAME) ] <- 'BOTH'
hm27k$PLATFORM[ which(hm27k$NAME %in% hm450k$NAME) ] <- 'BOTH'
hm27k$CHANNEL27 <- hm27k$COLOR_CHANNEL
hm27k$CHANNEL450 <- NA
hm27k$ADDRESSA_450 <- NA
hm27k$ADDRESSB_450 <- NA
hm27k$ADDRESSA_27 <- hm27k$ADDRESSA_ID
hm27k$ADDRESSB_27 <- hm27k$ADDRESSB_ID
hm27k$STRAND <- '*' ## this we do the brute-force way
commonColumns <- intersect( names(hm450k), names(hm27k) )
rownames(hm450k) <- hm450k$ID
rownames(hm27k) <- hm27k$ID

## this makes it relatively painless to move from hg18 to hg19
df2GR <- function(df, keepColumns=FALSE, ignoreStrand=FALSE){ # {{{
  require(GenomicRanges)
  stopifnot(class(df) == "data.frame")
  subs <- list(chromStart='start', chromEnd='end', chrom='chr', seqnames='chr')
  for(s in names(subs)) names(df) = gsub(s, subs[[s]], names(df), ignore=TRUE)
  stopifnot(all(c("start", "end") %in% names(df)))
  if('genome' %in% names(attributes(df))) g <- attr(df, 'genome') else g <- NULL
  if(substr(df$chr, 1, 3)[1] != 'chr') df$chr <- paste('chr', df$chr, sep='')
  df <- subset(df, !is.na(start) & !is.na(end))
  if(!ignoreStrand && ("strand" %in% names(df))) {
    if(is.numeric(df$strand)) df$strand <- strandMe(df$strand)
    GR <- with(df, GRanges(chr, IRanges(start=start, end=end), strand=strand))
  } else {
    GR <- with(df, GRanges(chr, IRanges(start=start, end=end)))
  }
  if('name' %in% names(df)) {
    names(GR) <- df$name
    df$name <- NULL
  } else {
    names(GR) <- rownames(df)
  }
  if(keepColumns) {
    skipped = c("rangename","chr","start","end","width","strand")
    elementMetadata(GR) <- as(df[, setdiff(names(df), skipped), drop=F],
                              "DataFrame")
  }
  if('X' %in% names(elementMetadata(GR))) {
    if(all(is.na(GR$X))) {
      GR$X <- NULL
    } else {
      names(elementMetadata(GR))[which(names(elementMetadata(GR))=='X')]='score'
    }
  }
  if(!is.null(g)) genome(GR) <- g
  return(GR)
} # }}}

## now we need to update the hm27k probe annotations:
hm27k$start <- hm27k$end <- as.numeric(hm27k$MAPINFO)
hm27k$chrom <- hm27k$CHR
hm27k$name <- hm27k$ID 
data(hg18ToHg19) # to lift old probes from hg18 to hg19
library(rtracklayer)
hm27k.gr <- df2GR(hm27k)
genome(hm27k.gr) <- 'hg18'
values(hm27k.gr)$name <- names(hm27k.gr)
data(hg18ToHg19)
hm27k.lifted <- unlist(liftOver(hm27k.gr, hg18ToHg19))
names(hm27k.lifted) <- values(hm27k.lifted)[,1]
genome(hm27k.lifted) <- 'hg19'

## we have 4 probes that just don't map; zero them out
## 
skip <- which(!(rownames(hm27k) %in% names(hm27k.lifted)))
noLift <- setdiff(rownames(hm27k), names(hm27k.lifted))
lifted <- intersect(rownames(hm27k), names(hm27k.lifted))
## hm27k[ noLift, 'CHR' ]
## [3] "17" "7"  "7"  "17"
## 
## Even allowing up to 5 mismatches they still don't map!
## Everything else, update it in the table for hm27k:
##
## identical( rownames(hm27k)[ -skip ], lifted) ## can check if desired
hm27k$CHR[ -skip ] <- gsub('chr','',as(hm27k.lifted, 'data.frame')$seqnames)
hm27k$MAPINFO[ -skip ] <- start(hm27k.lifted)
hm27k$CHR[ which(rownames(hm27k) %in% noLift) ] <- 'Un' ## this becomes 'chrUn'
hm27k$MAPINFO[ which(rownames(hm27k) %in% noLift) ] <- 0
kept <- match(rownames(hm27k)[which(hm27k$PLATFORM=='BOTH')],rownames(hm450k))
hm450k$CHANNEL27[ kept ] = hm27k$CHANNEL27[ which(hm27k$PLATFORM == 'BOTH') ]
hm450k$ADDRESSA_27[ kept ] = hm27k$ADDRESSA_27[which(hm27k$PLATFORM == 'BOTH')]
hm450k$ADDRESSB_27[ kept ] = hm27k$ADDRESSB_27[which(hm27k$PLATFORM == 'BOTH')]

hm27k.allProbes <- hm27k ## in case we need it later, if we screw up
hm27k <- hm27k[ which(hm27k$PLATFORM == 'HM27'), ] ## only old probes
keepColumns <- c('ADDRESSA_450','ADDRESSB_450', 'ADDRESSA_27', 'ADDRESSB_27',
                 'CHANNEL27','CHANNEL450', 'CHR','MAPINFO','SOURCESEQ',
                 'STRAND','PLATFORM','NAME')
InfiniumMethylation <- rbind(hm450k[, keepColumns], hm27k[, keepColumns])

## note that SNP probes do not have MAPINFO; will fix this from dbSNP
noMap <- which(is.na(as.numeric(InfiniumMethylation$MAPINFO)))
noMap.ids <- InfiniumMethylation$NAME[ noMap ]
message(paste('Note: GEO is missing genomic coordinates for probes',
              paste(noMap.ids, collapse=', ')))
data(hm450k.rsProbes) # extracted from dbSNP
InfiniumMethylation$CHR[noMap] = gsub('chr','',hm450k.rsProbes[noMap.ids,'CHR'])
InfiniumMethylation$MAPINFO[ noMap ] = hm450k.rsProbes[ noMap.ids, 'MAPINFO' ]
InfiniumMethylation$STRAND[ noMap ] = '*'

##
## now build the GRanges that will become the FeatureDb:
##
## scaffolding: 
##
require(Biostrings)
require(GenomicRanges)
sourceSeq <- with(InfiniumMethylation, DNAStringSet(SOURCESEQ))
gcContent <- round(letterFrequency(sourceSeq, letters='GC', as.prob=T), 2)
Infinium.GR <- with(InfiniumMethylation,
                 GRanges(paste0('chr', CHR), 
                         IRanges(as.numeric(MAPINFO), width=1),
                         strand=as.factor(ifelse(STRAND == 'F','-','+')),
                         addressA_450=Rle(ADDRESSA_450),
                         addressB_450=Rle(ADDRESSB_450),
                         addressA_27=Rle(ADDRESSA_27),
                         addressB_27=Rle(ADDRESSB_27),
                         channel450=Rle(as.factor(CHANNEL450)),
                         channel27=Rle(as.factor(CHANNEL27)),
                         probeType=Rle(as.factor(substr(NAME, 1, 2))),
                         percentGC=as.vector(gcContent),
                         platform=Rle(as.factor(PLATFORM)),
                         sourceSeq=DNAStringSet(SOURCESEQ)
                        )
                 )
names(Infinium.GR) <- InfiniumMethylation$NAME
both = setdiff(levels(values(Infinium.GR)$channel450)[1], c('Grn','Red'))
swap = which(levels(values(Infinium.GR)$channel450) == both)
levels(values(Infinium.GR)$channel450)[ swap ] <- 'Both'
genome(Infinium.GR) <- 'hg19'

## fix stranding and check dinucleotide sequence in the reference genome
##
library(BSgenome.Hsapiens.UCSC.hg19)
resizeToStart <- which(
  getSeq(Hsapiens, resize(Infinium.GR, 2, fix='start'), as.char=T) == 'CG'
)
resizeToEnd <- which(
  getSeq(Hsapiens, resize(Infinium.GR, 2, fix='end'), as.char=T) == 'CG'
)
Infinium.GR[resizeToStart] <- resize(Infinium.GR[resizeToStart], 2, fix='start')
Infinium.GR[ resizeToEnd ] <- resize(Infinium.GR[ resizeToEnd ], 2, fix='end')
restranded = length(resizeToEnd) + length(resizeToStart)
unstranded = length(Infinium.GR) - restranded # not bad! mostly CpH + SNP probes
print(paste(restranded, 'probes restranded, only', unstranded, 'left to go...'))

## fix CpH probes
##
lastLetter <- function(x) substr(x, nchar(x), nchar(x))
strand(Infinium.GR)[ which(lastLetter(names(Infinium.GR)) == 'F') ] <- '-'
strand(Infinium.GR)[ which(lastLetter(names(Infinium.GR)) == 'R') ] <- '+'
Infinium.GR <- resize(Infinium.GR, 
  width=ifelse(as.vector(values(Infinium.GR)$probeType)=='ch',3,2), 
  fix=ifelse(as.vector(strand(Infinium.GR))=='+','start','end')
)

## fix SNP probes (again)
##
start(Infinium.GR[ noMap.ids ]) = hm450k.rsProbes[ noMap.ids, 'MAPINFO' ]
width(Infinium.GR[ noMap.ids ]) = 1
strand(Infinium.GR) = '*'

## It turns out that the HM27k chip DOES have SNP probes.  Label these:
##
data(hm27.controls)
data(hm27.SNP.colors)
hm27.SNP.controls <- hm27.controls[grep('^rs', hm27.controls$Name), ]
for(i in unique(hm27.SNP.controls$Name)) {
  i = gsub('_$','',i)
  if( i %in% names(Infinium.GR) & !is.na(hm27.SNP.colors[i]) ) {
    addresses = hm27.SNP.controls$Address[ grep(i, hm27.SNP.controls$Name) ]
    values(Infinium.GR[ i ])$addressA_27 = addresses[1]
    values(Infinium.GR[ i ])$addressB_27 = addresses[2]
    values(Infinium.GR[ i ])$channel27 = hm27.SNP.colors[i]
    values(Infinium.GR[ i ])$platform = 'BOTH'
  }
}
#Infinium.GR[na.omit(match(unique(hm27.SNP.controls$Name), names(Infinium.GR)))]

## Now order the probes based on their name
##
Infinium.GR <- Infinium.GR[order(names(Infinium.GR))]

## Create the FDb, and save it.
##
FDb.InfiniumMethylation.hg19 = GenomicRangesToFeatureDb(
  Infinium.GR, 
  URL='ftp://ftp.illumina.com', 
  tableName='InfiniumMethylation', 
  src='NCBI/GEO and dbSNP', 
  label='Illumina Infinium DNA methylation probes, aligned to hg19'
)
saveFeatures(FDb.InfiniumMethylation.hg19, 
             file='FDb.InfiniumMethylation.hg19.sqlite')

## Now verify that it comes out the same as it went in (well, almost)
## The extra lines of code are, in my view, a bug in GenomicFeatures
## Also, the types of variables (Rle, numeric, etc.) are not retained
## Nonetheless, this is a pretty handy structure for storing the data.
##
FDb.InfiniumMethylation.hg19 <- loadDb('FDb.InfiniumMethylation.hg19.sqlite')
InfiniumMethylation <- features(FDb.InfiniumMethylation.hg19)
names(InfiniumMethylation) <- values(InfiniumMethylation)$name
met <- metadata(FDb.InfiniumMethylation.hg19) ## need to fetch genome
genome(InfiniumMethylation) <- met[ which(met[,'name'] == 'Genome'), 'value' ]
show(InfiniumMethylation)

## If you wanted to compute on the source sequences, e.g. observed/expected CpGs
##
values(InfiniumMethylation)$sourceSeq <- DNAStringSet(
  values(InfiniumMethylation)$sourceSeq
)

## To fit GC-dependent intensity models (e.g. for segmentation or preprocessing)
##
values(InfiniumMethylation)$percentGC <- as.numeric(
  values(InfiniumMethylation)$percentGC
)

## For a smaller object to save:
##
values(InfiniumMethylation)$addressA <- Rle(
  as.numeric(values(InfiniumMethylation)$addressA)
)
values(InfiniumMethylation)$addressB <- Rle(
  as.numeric(values(InfiniumMethylation)$addressB)
)
values(InfiniumMethylation)$probeType <- Rle(
  as.factor(values(InfiniumMethylation)$probeType)
)
values(InfiniumMethylation)$platform <- Rle(
  as.factor(values(InfiniumMethylation)$platform)
)
values(InfiniumMethylation)$channel <- Rle(
  as.factor(values(InfiniumMethylation)$channel)
)
values(InfiniumMethylation)$design <- Rle(
  as.factor(values(InfiniumMethylation)$design)
)
names(InfiniumMethylation) <- values(InfiniumMethylation)$name
values(InfiniumMethylation)$name <- NULL
show(InfiniumMethylation)

## by probe type:
split(InfiniumMethylation, values(InfiniumMethylation)$probeType)

## by chromosome:
split(InfiniumMethylation, seqnames(InfiniumMethylation))

## by color channel:
split(InfiniumMethylation, values(InfiniumMethylation)$channel)

## by platform, with a note about 'design':
Infinium.by.platform <- function() {
  on450k <- which(values(InfiniumMethylation)$platform %in% c('HM450','BOTH'))
  on27k <-  which(values(InfiniumMethylation)$platform %in% c('HM27','BOTH'))
  GRangesList( HM450=InfiniumMethylation[ on450k ],
               HM27=InfiniumMethylation[ on27k ] )
}
by.platform <- Infinium.by.platform()
names(by.platform)
