This file contains step-by-step instructions on how to obtain the data and deconvolute phenotypes from pathogen infection screens, as shown in "gespeR: A statistical model for deconvoluting off-target-confounded RNA interference screens" (Schmich et al., 2015). For full compatibility, please download the development version of gespeR at: https://github.com/fschmich/gespeR


Downloading data

Phenotypic data

  1. Go to https://pubchem.ncbi.nlm.nih.gov/assay/assay.cgi?aid=1117357&viewcode=X, replacing X with the viewcode provided (data is on hold until March 19, 2016).
  2. In the top table, in the description field, click View All Data.
  3. Scroll to the bottom of the page. Under Result Exports, choose to save the BioAssay as a CSV file to /tmp/phenotypes.csv.
  4. Download the mapping from PubChem to Vendor siRNA IDs from http://www.ncbi.nlm.nih.gov/pcsubstance/?term=%22InfectX+Consortium%22%5Bsourcename%5D. On the top right of the page, press Send To -> File -> Format: ID Map and save to /tmp/mapping.txt.

Target relation matrices

  1. Go to https://www1.ethz.ch/bsse/cbg/software/gespeR.
  2. Download the following siRNA-to-gene target relation matrices:
    • Qiagen libraries to /tmp/QIAGEN.rds
    • Dharmacon library to /tmp/DHARMACON.rds
    • Validation library to /tmp/VALIDATION.rds
  3. Alternatively, from within R, run:
# Download the three siRNA-to-gene target relation matrices to /tmp.
# The printed result is a named integer vector holding the status code that
# download.file() returned for each library. vapply() pins that return type
# instead of letting sapply() guess it.
vapply(c("QIAGEN", "DHARMACON", "VALIDATION"), function(x) {
  download.file(url = sprintf("http://n.ethz.ch/~fschmich/gespeR/%s.rds", x),
                destfile = sprintf("/tmp/%s.rds", x), quiet = FALSE, mode = "wb")
}, integer(1))

Preprocessing data

Target relation matrices

# library() stops immediately if a package is missing; require() would only
# warn and let the script fail later with a less helpful error.
library(Matrix)
library(gespeR)
# Load the Qiagen siRNA-to-gene target relation matrix as a TargetRelations
# object. Only this library is used in the remainder of this file.
Q <- TargetRelations("/tmp/QIAGEN.rds")
show(Q)
## 91003 x 27240 siRNA-to-gene relations.
## 10 x 5 sparse Matrix of class "dgCMatrix"
##            1    2 3          9 10
## SI00000007 . 0.75 . .           .
## SI00000035 . .    . .           .
## SI00000063 . .    . 0.10992427  .
## SI00000070 . .    . .           .
## SI00000077 . .    . .           .
## SI00000084 . .    . .           .
## SI00000105 . .    . .           .
## SI00000112 . .    . 0.09310967  .
## SI00000119 . .    . .           .
## SI00000133 . .    . .           .
## ...

Phenotypic data

# library() stops immediately if a package is missing; require() would only
# warn and let the script fail later with a less helpful error.
library(dplyr)
library(reshape2)
library(gespeR)
# Read phenotypes: keep the identifier columns plus every "Infectivity" column
# of the PubChem export, then reshape to long format so that each row holds
# one SID / Infectivity-column combination.
phenotypes <- read.csv("/tmp/phenotypes.csv", stringsAsFactors = FALSE) %>%
  tbl_df() %>%
  select(SID, GeneID = NCBI.Gene.ID, siRNASet = SIRNA_SET,
         contains("Infectivity")) %>%
  melt(id.vars = c("SID", "siRNASet", "GeneID"),
       variable.name = "Pathogen", value.name = "Phenotype") %>%
  tbl_df() %>%
  select(SID, GeneID, siRNASet, Pathogen, Phenotype) %>%
  # Strip the column-name prefix so that only the pathogen name remains, and
  # make SID a character key for the join with the ID mapping
  mutate(Pathogen = gsub("Infectivity_", "", Pathogen),
         SID = as.character(SID)) %>%
  filter(!is.na(Phenotype)) %>% # Artifact of how data is deposited in Pubchem
  arrange(SID)
head(phenotypes)
## Source: local data frame [6 x 5]
## 
##         SID GeneID siRNASet   Pathogen Phenotype
## 1 249376050     53        5 Salmonella  0.354975
## 2 249376051     53        5 Salmonella  1.588140
## 3 249376052     53        5 Salmonella  2.014610
## 4 249376053    373        5 Salmonella -1.078850
## 5 249376054    379        5 Salmonella -1.452680
## 6 249376055   9275        5 Salmonella  0.342689
# Read ID mapping between SIDs and Vendor IDs
# The export is read as one column of text lines and only every second line is
# used. Each of those lines is split on the "SID: " and " InfectX Consortium: "
# markers; pieces 2 and 3 are the SID and the vendor ID.
map <- local({
  lines <- read.delim("/tmp/mapping.txt", header = FALSE,
                      stringsAsFactors = FALSE)[[1]]
  # Logical indexing also works when the file has fewer than two lines,
  # where seq(2, n, by = 2) would raise an error
  lines <- lines[seq_along(lines) %% 2 == 0]
  fields <- strsplit(lines, split = "SID: | InfectX Consortium: ")
  data.frame(SID = vapply(fields, `[`, character(1), 2),
             VendorID = vapply(fields, `[`, character(1), 3),
             stringsAsFactors = FALSE)
}) %>% tbl_df()

# Attach the vendor siRNA ID to every phenotype row (matched on SID) and place
# it directly after the SID column
phenotypes <- phenotypes %>%
  left_join(map, by = "SID") %>%
  tbl_df() %>%
  select(SID, VendorID, siRNASet, GeneID, Pathogen, Phenotype)
head(phenotypes)
## Source: local data frame [6 x 6]
## 
##         SID VendorID siRNASet GeneID   Pathogen Phenotype
## 1 249376050    10031        5     53 Salmonella  0.354975
## 2 249376051    10122        5     53 Salmonella  1.588140
## 3 249376052    10210        5     53 Salmonella  2.014610
## 4 249376053    10235        5    373 Salmonella -1.078850
## 5 249376054    10240        5    379 Salmonella -1.452680
## 6 249376055    10268        5   9275 Salmonella  0.342689
# Build Phenotypes objects for every pathogen and each siRNA set 1-4:
# obs.ssp holds the siRNA-level observations, obs.gsp their per-gene means.
obs.ssp <- list()
obs.gsp <- list()
phenotypes <- split(phenotypes, phenotypes$Pathogen)
for (pathogen in names(phenotypes)) {
  obs.ssp[[pathogen]] <- list()
  obs.gsp[[pathogen]] <- list()
  for (s in 1:4) {
    # Rows of this siRNA set whose vendor ID is listed in Q@siRNAs
    spl <- phenotypes[[pathogen]] %>%
      filter(siRNASet == s, VendorID %in% Q@siRNAs)
    obs.ssp[[pathogen]][[s]] <- Phenotypes(
      phenotypes = Matrix(spl$Phenotype),
      ids = spl$VendorID,
      pnames = c("Infectivity"),
      type = "SSP"
    )
    # Average the phenotypes of all rows that share a GeneID; rows without a
    # GeneID and genes whose mean is NaN are dropped
    spl.noNA <- spl %>%
      filter(!is.na(GeneID)) %>%
      group_by(GeneID) %>%
      summarise(Phenotype = mean(Phenotype, na.rm = TRUE)) %>%
      filter(!is.nan(Phenotype))
    # Gene-based Phenotypes objects are required for the concordance evaluation
    obs.gsp[[pathogen]][[s]] <- Phenotypes(
      phenotypes = Matrix(spl.noNA$Phenotype),
      ids = as.character(spl.noNA$GeneID),
      pnames = c("Infectivity"),
      type = "GSP"
    )
  }
}
show(obs.ssp$Bartonella[[1]])
## 20087 SSP Phenotypes
## 
## Source: local data frame [20,087 x 2]
## 
##            ID Infectivity
## 1  SI00000035  -1.2441600
## 2  SI00000077   2.0732300
## 3  SI00000112  -1.0704600
## 4  SI00000168  -1.7540600
## 5  SI00000266  -0.8546760
## 6  SI00000399  -0.4619630
## 7  SI00000420   1.3742000
## 8  SI00000476   0.4220430
## 9  SI00000518  -1.7471500
## 10 SI00000567   0.0507219
## ..        ...         ...

Fitting gespeR models

# library() stops immediately if the package is missing; require() would only
# warn and let the script fail later with a less helpful error.
library(gespeR)
# Fit gespeR models: one per pathogen and siRNA set (1-4), each on the
# siRNA-level phenotypes in obs.ssp with the target relations in Q
ans.cv <- list()
for (pathogen in c("Bartonella", "Brucella", "Salmonella")) {
  ans.cv[[pathogen]] <- list()
  for (s in 1:4) {
    # Progress message, since each fit is a separate gespeR() call
    cat(sprintf("set: %d, pathogen: %s\n", s, pathogen))
    ges <- gespeR(phenotypes = obs.ssp[[pathogen]][[s]],
                  target.relations = Q,
                  mode = "cv",
                  alpha = 0.5,
                  ncores = 1)
    ans.cv[[pathogen]][[s]] <- unloadValues(ges, writeValues = FALSE)
  }
}
# Obtain gene-specific phenotypes (GSPs)
ges.gsp <- lapply(ans.cv, function(x) {
  lapply(x, gsp)
})

Concordance evaluation

# library() stops immediately if a package is missing; require() would only
# warn and let the script fail later with a less helpful error.
library(gespeR)
library(ggplot2)
# Function computes concordance between all pairs of phenotypes. Measures used
# are Spearman's correlation, rank-biased overlap and the Jaccard index.
# Observed phenotypes are cut to the same length as gespeR GSPs, respecting the
# proportion of negative and positive phenotypes, in order to guarantee fair
# comparison.
#
# Arguments:
#   phen  Named list with one entry per pathogen; each entry is a list of
#         Phenotypes objects that are compared pairwise.
#   cut   NULL (no cutting), or a list indexed like `phen` whose Phenotypes
#         objects supply the numbers of positive and negative phenotypes that
#         the matching entry of `phen` is cut to.
# Value:
#   Long-format table of all pathogens bound together, with one row per
#   concordance measure and the columns Measure, value, Method and Pathogen
#   (plus the `phen` column returned by concordance()). Method is "SSP" when
#   `cut` is given and "gespeR" otherwise.
get.conc <- function(phen, cut = NULL) {
  min.overlap <- 10
  rbo.k <- 1000
  rbo.p <- 1 - 1e-3
  rbo.mid <- 0
  cor.method <- "spearman"
  uneven.lengths <- TRUE

  # Pairwise concordance for one pathogen, reshaped to long format and tagged
  # with the method label and the pathogen name.
  tidy.conc <- function(phen.list, method.label, pathogen.name) {
    concordance(phen.list,
                min.overlap = min.overlap,
                rbo.k = rbo.k,
                rbo.p = rbo.p,
                cor.method = cor.method,
                rbo.mid = rbo.mid,
                uneven.lengths = uneven.lengths) %>%
      data.frame %>%
      select(-lisect) %>%
      melt(id.vars = c("test.pair", "phen")) %>%
      mutate(Method = method.label, Pathogen = pathogen.name) %>%
      select(-test.pair, Method, Pathogen, Measure = variable, value) %>%
      tbl_df()
  }

  # Cut one observed ranking to the number of positive and negative
  # phenotypes found in the matching reference ranking.
  cut.to.reference <- function(observed, reference) {
    ref <- as.data.frame(reference)
    n.pos <- sum(ref$Infectivity > 0, na.rm = TRUE)
    n.neg <- sum(ref$Infectivity < 0, na.rm = TRUE)
    ranked <- as.data.frame(observed) %>%
      tbl_df() %>%
      mutate(ID = as.character(ID)) %>%
      filter(!is.na(Infectivity)) %>%
      arrange(desc(Infectivity))
    # head()/tail() return nothing for a count of zero, whereas 1:0 and
    # (n + 1):n would pick the first row, the last row and an NA row.
    # unique() keeps each row once if the two ends overlap.
    rows <- seq_len(nrow(ranked))
    keep <- unique(c(head(rows, n.pos), tail(rows, n.neg)))
    ranked <- ranked[keep, ]
    Phenotypes(phenotypes = Matrix(ranked$Infectivity),
               ids = ranked$ID,
               pnames = c("Infectivity"),
               type = "SSP")
  }

  lapply(names(phen), function(x) {
    if (is.null(cut)) {
      tidy.conc(phen[[x]], "gespeR", x)
    } else {
      # cut longer ranked lists to gespeR's lengths
      phencut <- phen[[x]]
      for (lib in seq_along(cut[[x]])) {
        phencut[[lib]] <- cut.to.reference(phen[[x]][[lib]], cut[[x]][[lib]])
      }
      tidy.conc(phencut, "SSP", x)
    }
  }) %>% do.call("rbind", .)
}
# Concordance among the gespeR GSPs, and among the observed gene-level
# phenotypes after cutting them to the lengths of the gespeR GSPs
conc.gespeR <- get.conc(phen = ges.gsp)
conc.obs <- get.conc(phen = obs.gsp, cut = ges.gsp)

# Stack both result tables and replace the raw codes by display labels
dat <- tbl_df(rbind(conc.gespeR, conc.obs))
dat$Pathogen <- factor(dat$Pathogen,
                       levels = c("Brucella", "Bartonella", "Salmonella"),
                       labels = c("B. abortus", "B. henselae", "S. typhimurium"))
dat$Method <- factor(dat$Method,
                     levels = c("gespeR", "SSP"),
                     labels = c("gespeR", "Observed"))
# The measure labels are plotmath expressions, parsed by label_parsed below
dat$Measure <- factor(dat$Measure,
                      levels = c("cor", "rbo.top", "rbo.bottom", "jaccard"),
                      labels = c(expression(rho), expression(rbo["" %down% ""]),
                                 expression(rbo["" %up% ""]), expression("J")))

# Boxplots of the concordance values: one facet per measure, pathogens on the
# x axis, methods distinguished by colour
ggplot(dat, aes(x = Pathogen, y = value, colour = Method)) +
  geom_boxplot(width = 0.8, outlier.size = 0) +
  facet_grid(. ~ Measure, labeller = label_parsed) +
  scale_colour_manual("", values = c("#d7191c", "#525252"), drop = FALSE) +
  labs(x = "", y = "") +
  ylim(0, 1) +
  theme_bw(base_size = 12, base_family = "Helvetica") +
  theme(
    axis.text = element_text(size = rel(1.0)),
    axis.title = element_text(size = rel(1.0), face = "bold"),
    strip.text = element_text(size = rel(1.0), face = "bold"),
    axis.ticks = element_line(colour = "black"),
    legend.key = element_rect(colour = NA),
    legend.text = element_text(size = rel(1.0)),
    legend.title = element_text(size = rel(1.0), face = "bold"),
    panel.background = element_rect(fill = "white", colour = NA),
    panel.border = element_rect(fill = NA, colour = "grey50"),
    panel.grid.major = element_line(colour = "grey90", size = 0.2),
    panel.grid.minor = element_line(colour = "grey98", size = 0.5),
    strip.background = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Session information

# Print the R version, platform, locale and package versions of this session
sessionInfo()
## R version 3.2.0 (2015-04-16)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.9.5 (Mavericks)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] reshape2_1.4.1 dplyr_0.4.2    gespeR_1.1.1   ggplot2_1.0.1 
## [5] Matrix_1.2-1  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.11.6           mvtnorm_1.0-2         lattice_0.20-31      
##  [4] assertthat_0.1        glmnet_2.0-2          digest_0.6.8         
##  [7] foreach_1.4.2         R6_2.0.1              GenomeInfoDb_1.4.1   
## [10] plyr_1.8.3            stats4_3.2.0          pcaPP_1.9-60         
## [13] RSQLite_1.0.0         evaluate_0.7          BiocInstaller_1.18.3 
## [16] zlibbioc_1.14.0       lazyeval_0.1.10       annotate_1.46.0      
## [19] S4Vectors_0.6.0       preprocessCore_1.30.0 rmarkdown_0.7        
## [22] labeling_0.3          proto_0.3-10          splines_3.2.0        
## [25] stringr_1.0.0         RCurl_1.95-4.6        biomaRt_2.24.0       
## [28] munsell_0.4.2         BiocGenerics_0.14.0   htmltools_0.2.6      
## [31] IRanges_2.2.4         codetools_0.2-11      XML_3.98-1.2         
## [34] rrcov_1.3-8           MASS_7.3-41           bitops_1.0-6         
## [37] grid_3.2.0            RBGL_1.44.0           prada_1.44.0         
## [40] xtable_1.7-4          GSEABase_1.30.2       gtable_0.1.2         
## [43] affy_1.46.1           DBI_0.3.1             magrittr_1.5         
## [46] formatR_1.2           scales_0.2.5          graph_1.46.0         
## [49] stringi_0.5-2         genefilter_1.50.0     affyio_1.36.0        
## [52] doParallel_1.0.8      limma_3.24.10         robustbase_0.92-4    
## [55] RColorBrewer_1.1-2    iterators_1.0.7       tools_3.2.0          
## [58] Biobase_2.28.0        Category_2.34.2       DEoptimR_1.0-2       
## [61] parallel_3.2.0        survival_2.38-2       yaml_2.1.13          
## [64] AnnotationDbi_1.30.1  colorspace_1.2-6      cluster_2.0.2        
## [67] vsn_3.36.0            cellHTS2_2.32.0       knitr_1.10.5