K-nearest neighbors:

We read in input.scone.csv, which is our file modified (and renamed) from the get.marker.names() function. The K-nearest neighbor generation is derived from the Fast Nearest Neighbors (FNN) R package, within our function Fnn(), which takes as input the “input markers” to be used, along with the concatenated data previously generated, and the desired k. We advise setting k, by default, to the total number of cells in the dataset divided by 100, as this has been optimized on existing mass cytometry datasets. The output of this function is a list of two matrices, each with one row per cell: the first gives the identity of each cell's k-nearest neighbors, in terms of their row numbers in the dataset used here as input, and the second gives the Euclidean distance between the cell and each of those neighbors.

library(Sconify)
# Marker file: user-generated in a real analysis, here the example
# markers.csv looked up in the package's extdata directory
marker.file <- system.file(
  "extdata", "markers.csv",
  package = "Sconify"
)
markers <- ParseMarkers(marker.file)

# The parsed result is a list holding the input and the functional marker names
markers
## $input
##  [1] "CD3(Cd110)Di"           "CD3(Cd111)Di"          
##  [3] "CD3(Cd112)Di"           "CD235-61-7-15(In113)Di"
##  [5] "CD3(Cd114)Di"           "CD45(In115)Di"         
##  [7] "CD19(Nd142)Di"          "CD22(Nd143)Di"         
##  [9] "IgD(Nd145)Di"           "CD79b(Nd146)Di"        
## [11] "CD20(Sm147)Di"          "CD34(Nd148)Di"         
## [13] "CD179a(Sm149)Di"        "CD72(Eu151)Di"         
## [15] "IgM(Eu153)Di"           "Kappa(Sm154)Di"        
## [17] "CD10(Gd156)Di"          "Lambda(Gd157)Di"       
## [19] "CD24(Dy161)Di"          "TdT(Dy163)Di"          
## [21] "Rag1(Dy164)Di"          "PreBCR(Ho165)Di"       
## [23] "CD43(Er167)Di"          "CD38(Er168)Di"         
## [25] "CD40(Er170)Di"          "CD33(Yb173)Di"         
## [27] "HLA-DR(Yb174)Di"       
## 
## $functional
##  [1] "pCrkL(Lu175)Di"  "pCREB(Yb176)Di"  "pBTK(Yb171)Di"  
##  [4] "pS6(Yb172)Di"    "cPARP(La139)Di"  "pPLCg2(Pr141)Di"
##  [7] "pSrc(Nd144)Di"   "Ki67(Sm152)Di"   "pErk12(Gd155)Di"
## [10] "pSTAT3(Gd158)Di" "pAKT(Tb159)Di"   "pBLNK(Gd160)Di" 
## [13] "pP38(Tm169)Di"   "pSTAT5(Nd150)Di" "pSyk(Dy162)Di"  
## [16] "tIkBa(Er166)Di"
# Get the particular markers to be used as knn and knn statistics input.
# The list elements are selected by name rather than by position, so the
# assignments stay correct even if the order of the elements changes.
input.markers <- markers[["input"]]
funct.markers <- markers[["functional"]]

# Selection of the k. See "Finding Ideal K" vignette
k <- 30

# Run the package's nearest-neighbor search on the input markers
wand.nn <- Fnn(
  cell.df = wand.combined,
  input.markers = input.markers,
  k = k
)
# The result is a list of 2 matrices with cells in rows and
#   k-nearest neighbors in columns: the first holds the cell identity
#   of each nn, the second the euclidean distance between that nn
#   and the cell of interest

# Indices: first element of the list. Each row belongs to one cell and
#   lists its nearest neighbors by their row numbers in the input data
str(wand.nn[[1]])
##  int [1:1000, 1:30] 622 870 691 793 185 51 470 648 358 499 ...
# Preview: first 10 neighbor columns for the first 20 cells
wand.nn[[1]][1:20, 1:10]
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]  622  928  823  110  609  776  591  656  206   803
##  [2,]  870  204   97  358   91  168  603  420   98   696
##  [3,]  691  228  187  804  176  843  816  120  642   677
##  [4,]  793   91  499  540  816  478  420  555   97   893
##  [5,]  185  600  183  631  483  961  929  830  475   786
##  [6,]   51  784  946  408  755  271  274   93   85   416
##  [7,]  470  888  161  400  323  303  839  380  306   300
##  [8,]  648  519  368  217  652  607  101  221  455   308
##  [9,]  358  238  404  631  786  716  890  921  929   319
## [10,]  499  975  164  293  909  582  364  816  798    91
## [11,]  697  467  244  991  983  346  572  772  271   191
## [12,]  288  512  259  160  594  887  719   13  908   756
## [13,]  188  160  887  133  756  594  908  288   12   601
## [14,]  908  188  288   13  952  887  756  594  762   725
## [15,]  919  524  300  392   42  708   19   45  256   173
## [16,]  271   38  568  342  896  982  741  448  903   242
## [17,]  986  146  319  965   27  412  231  485   49   974
## [18,]   93  767  543  695   50  626  126  551  309   784
## [19,]   29  326  919  566  773   42   15  747  247   174
## [20,]  552   89  688  822  473  217  895  926  953   581
# Distance: second element of the list. Each row belongs to one cell and
#   holds the euclidean distance to each of its nearest neighbors
str(wand.nn[[2]])
##  num [1:1000, 1:30] 16.4 10.6 15.8 11.1 17.6 ...
# Preview: distances to the first 10 neighbor columns for the first 20 cells
wand.nn[[2]][1:20, 1:10]
##            [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
##  [1,] 16.383190 16.571811 18.653130 19.879021 20.773181 22.323089
##  [2,] 10.558453 10.983828 12.063963 12.547400 12.657836 12.840222
##  [3,] 15.826808 16.024975 16.072841 16.175392 16.829647 18.057126
##  [4,] 11.133191 11.136051 12.981159 13.973247 14.060726 14.164527
##  [5,] 17.633241 19.723446 20.048174 20.419896 20.675723 21.109719
##  [6,]  6.408746  8.846825  9.397899 10.006398 10.884576 11.229829
##  [7,] 10.977200 11.952767 12.793220 13.593829 14.853916 16.617766
##  [8,]  8.832842  9.754949 10.920389 11.158428 11.296905 11.569888
##  [9,]  8.009838  9.724828 12.064816 12.619212 12.769342 12.833446
## [10,]  8.143524 10.305750 10.529389 12.241582 12.503860 12.519685
## [11,]  8.656984  9.125214 10.596465 10.749750 10.991817 11.251921
## [12,]  8.262410 12.579773 17.012079 17.119662 17.143223 19.129197
## [13,] 12.078006 14.551226 15.414581 15.664579 16.073398 16.294076
## [14,] 14.602047 16.870434 22.443291 22.933792 23.014462 23.112841
## [15,]  9.550121  9.646311  9.841216  9.963899 10.644395 10.919139
## [16,] 11.056313 11.847643 13.499677 13.704764 15.050652 16.616376
## [17,]  8.396840 10.425115 11.219143 11.608211 11.741120 12.233398
## [18,] 15.724607 16.735962 17.086505 17.239205 19.785690 20.938288
## [19,]  9.461759  9.574205 10.426373 10.545497 10.629218 11.096676
## [20,]  6.502150  7.423296  8.016033  8.341058  8.350312  8.769996
##            [,7]      [,8]      [,9]     [,10]
##  [1,] 22.911132 24.257496 24.829509 24.968521
##  [2,] 13.097222 13.724592 13.905108 14.342221
##  [3,] 18.412797 18.481790 18.552500 18.581846
##  [4,] 15.155614 15.526608 15.542667 16.764868
##  [5,] 22.409380 22.932324 23.004560 23.297250
##  [6,] 11.399652 11.981049 12.184966 12.251861
##  [7,] 17.330806 17.569522 17.615606 17.684615
##  [8,] 11.619787 11.763009 12.022792 12.040336
##  [9,] 13.103806 13.575239 13.714620 13.906064
## [10,] 14.306216 15.230310 15.399281 15.636467
## [11,] 11.390838 12.456292 12.854690 13.443651
## [12,] 19.567007 19.683361 19.936071 20.306642
## [13,] 17.854225 19.048397 19.683361 20.252489
## [14,] 23.874535 28.497303 28.661184 29.183219
## [15,] 11.210361 11.282178 11.325266 12.064564
## [16,] 16.970435 17.685089 17.915440 17.935181
## [17,] 12.241202 12.885562 12.894533 13.014241
## [18,] 21.115189 21.173584 21.567418 21.630625
## [19,] 11.210361 11.312744 12.162692 12.300244
## [20,]  8.870509  8.928113  8.938418  9.234881

Finding scone values:

The SconeValues() function iterates through each KNN, and performs a series of calculations. The first is fold change values for each marker per KNN, where the user chooses whether this will be based on medians or means. The second is a statistical test, where the user chooses a t test or a Mann-Whitney U test. I prefer the latter, because it does not assume any properties of the distributions. Of note, the p values are adjusted for false discovery rate, and therefore are called q values in the output of this function. The user also inputs a threshold parameter (default 0.05), where the fold change values will only be shown if the corresponding statistical test returns a q value below said threshold. Finally, the “multiple.donor.compare” option, if set to TRUE, will perform a t test based on the mean per-marker values of each donor. This is to allow the user to make comparisons across replicates or multiple donors if that is relevant to the user’s biological questions. This function returns a matrix of cells by computed values (change and statistical test results, labeled either marker.change or marker.qvalue). This matrix is intermediate, as it gets concatenated with the original input matrix in the post-processing step (see the relevant vignette). We show the code and the output below. See the post-processing vignette, where we show how this gets combined with the input data, and additional analysis is performed.

# Per-KNN statistics for the functional markers. "basal" is passed as the
# unstim argument (presumably the label of the unstimulated condition;
# confirm against the SconeValues documentation). All other options are
# left at their defaults.
wand.scone <- SconeValues(
  nn.matrix = wand.nn,
  cell.data = wand.combined,
  scone.markers = funct.markers,
  unstim = "basal"
)

# Print the resulting per-cell table of q values and change values
wand.scone
## # A tibble: 1,000 x 34
##    `pCrkL(Lu175)Di.I… `pCREB(Yb176)Di.I… `pBTK(Yb171)Di.… `pS6(Yb172)Di.I…
##                 <dbl>              <dbl>            <dbl>            <dbl>
##  1              1                  0.902            0.889            1    
##  2              0.996              0.979            0.832            0.963
##  3              0.927              0.917            0.979            0.988
##  4              0.993              0.979            0.992            1    
##  5              0.937              0.987            0.822            0.963
##  6              0.937              0.979            0.815            0.963
##  7              0.937              0.889            0.891            1    
##  8              0.976              1                0.822            1    
##  9              0.937              0.949            0.822            0.963
## 10              0.929              0.923            0.962            1    
## # ... with 990 more rows, and 30 more variables:
## #   `cPARP(La139)Di.IL7.qvalue` <dbl>, `pPLCg2(Pr141)Di.IL7.qvalue` <dbl>,
## #   `pSrc(Nd144)Di.IL7.qvalue` <dbl>, `Ki67(Sm152)Di.IL7.qvalue` <dbl>,
## #   `pErk12(Gd155)Di.IL7.qvalue` <dbl>,
## #   `pSTAT3(Gd158)Di.IL7.qvalue` <dbl>, `pAKT(Tb159)Di.IL7.qvalue` <dbl>,
## #   `pBLNK(Gd160)Di.IL7.qvalue` <dbl>, `pP38(Tm169)Di.IL7.qvalue` <dbl>,
## #   `pSTAT5(Nd150)Di.IL7.qvalue` <dbl>, `pSyk(Dy162)Di.IL7.qvalue` <dbl>,
## #   `tIkBa(Er166)Di.IL7.qvalue` <dbl>, `pCrkL(Lu175)Di.IL7.change` <dbl>,
## #   `pCREB(Yb176)Di.IL7.change` <dbl>, `pBTK(Yb171)Di.IL7.change` <dbl>,
## #   `pS6(Yb172)Di.IL7.change` <dbl>, `cPARP(La139)Di.IL7.change` <dbl>,
## #   `pPLCg2(Pr141)Di.IL7.change` <dbl>, `pSrc(Nd144)Di.IL7.change` <dbl>,
## #   `Ki67(Sm152)Di.IL7.change` <dbl>, `pErk12(Gd155)Di.IL7.change` <dbl>,
## #   `pSTAT3(Gd158)Di.IL7.change` <dbl>, `pAKT(Tb159)Di.IL7.change` <dbl>,
## #   `pBLNK(Gd160)Di.IL7.change` <dbl>, `pP38(Tm169)Di.IL7.change` <dbl>,
## #   `pSTAT5(Nd150)Di.IL7.change` <dbl>, `pSyk(Dy162)Di.IL7.change` <dbl>,
## #   `tIkBa(Er166)Di.IL7.change` <dbl>, IL7.fraction.cond.2 <dbl>,
## #   density <dbl>

For programmers: performing additional per-KNN statistics

If one wants to export KNN data to perform other statistics not available in this package, then I provide a function that produces a list of each cell identity in the original input data matrix, and a matrix of all cells x features of its KNN.

I also provide a function to find the KNN density estimation independently of the rest of the “SconeValues” analysis, to save time if density is all the user wants. With this density estimation, one can perform interesting analysis, ranging from understanding phenotypic density changes along a developmental progression (see post-processing vignette for an example), to trying out density-based binning methods (e.g. X-shift). Of note, this density is specifically one divided by the average distance to k-nearest neighbors. This specific measure is related to the Shannon Entropy estimate of that point on the manifold (https://hal.archives-ouvertes.fr/hal-01068081/document).

I use this metric to avoid the unusual properties of the volume of a sphere as it increases in dimensions (https://en.wikipedia.org/wiki/Volume_of_an_n-ball). This being said, one can modify this vector to be such a density estimation (for example, http://www.cs.haifa.ac.il/~rita/ml_course/lectures_old/KNN.pdf), by treating the distance to knn as the radius of an n-dimensional sphere and incorporating said volume accordingly.

An individual with basic programming skills can iterate through these elements to perform the statistics of one’s choosing. Examples would include per-KNN regression and classification, or feature imputation. The additional functionality is shown below, with the example knn.list in the package being the first ten instances:

# Build the KNN list from the cell data and the nearest-neighbor result
wand.knn.list <- MakeKnnList(
  cell.data = wand.combined,
  nn.matrix = wand.nn
)
# Inspect a single element of the list (here the eighth): the table of
# k-nearest neighbors x features for that cell
wand.knn.list[[8]]
## # A tibble: 30 x 51
##    `CD3(Cd110)Di` `CD3(Cd111)Di` `CD3(Cd112)Di` `CD235-61-7-15(In113)Di`
##             <dbl>          <dbl>          <dbl>                    <dbl>
##  1        -0.196        -0.247          -0.189                    -0.655
##  2        -0.0239       -0.00583         0.652                     0.340
##  3         0.946        -0.155          -0.0662                   -1.13 
##  4         0.504        -0.0952         -0.0422                    0.453
##  5        -0.823        -0.238          -0.627                    -0.819
##  6        -0.232        -0.238          -0.208                    -0.952
##  7        -0.237         1.55           -0.288                     0.490
##  8         0.722        -0.113           1.70                     -0.162
##  9        -0.180        -0.101          -0.123                    -1.61 
## 10        -0.156        -0.297          -0.660                    -1.48 
## # ... with 20 more rows, and 47 more variables: `CD3(Cd114)Di` <dbl>,
## #   `CD45(In115)Di` <dbl>, `CD19(Nd142)Di` <dbl>, `CD22(Nd143)Di` <dbl>,
## #   `IgD(Nd145)Di` <dbl>, `CD79b(Nd146)Di` <dbl>, `CD20(Sm147)Di` <dbl>,
## #   `CD34(Nd148)Di` <dbl>, `CD179a(Sm149)Di` <dbl>, `CD72(Eu151)Di` <dbl>,
## #   `IgM(Eu153)Di` <dbl>, `Kappa(Sm154)Di` <dbl>, `CD10(Gd156)Di` <dbl>,
## #   `Lambda(Gd157)Di` <dbl>, `CD24(Dy161)Di` <dbl>, `TdT(Dy163)Di` <dbl>,
## #   `Rag1(Dy164)Di` <dbl>, `PreBCR(Ho165)Di` <dbl>, `CD43(Er167)Di` <dbl>,
## #   `CD38(Er168)Di` <dbl>, `CD40(Er170)Di` <dbl>, `CD33(Yb173)Di` <dbl>,
## #   `HLA-DR(Yb174)Di` <dbl>, Time <dbl>, Cell_length <dbl>,
## #   `cPARP(La139)Di` <dbl>, `pPLCg2(Pr141)Di` <dbl>,
## #   `pSrc(Nd144)Di` <dbl>, `pSTAT5(Nd150)Di` <dbl>, `Ki67(Sm152)Di` <dbl>,
## #   `pErk12(Gd155)Di` <dbl>, `pSTAT3(Gd158)Di` <dbl>,
## #   `pAKT(Tb159)Di` <dbl>, `pBLNK(Gd160)Di` <dbl>, `pSyk(Dy162)Di` <dbl>,
## #   `tIkBa(Er166)Di` <dbl>, `pP38(Tm169)Di` <dbl>, `pBTK(Yb171)Di` <dbl>,
## #   `pS6(Yb172)Di` <dbl>, `pCrkL(Lu175)Di` <dbl>, `pCREB(Yb176)Di` <dbl>,
## #   `DNA1(Ir191)Di` <dbl>, `DNA2(Ir193)Di` <dbl>,
## #   `Viability1(Pt195)Di` <dbl>, `Viability2(Pt196)Di` <dbl>,
## #   wanderlust <dbl>, condition <chr>
# Finds the KNN density estimation for each cell: a numeric vector with one
# value per cell of the original data matrix.
# NOTE(review): the values are presumably in the row (cell) order of that
# matrix, since cells are the rows of the nearest-neighbor matrices; confirm.
wand.knn.density <- GetKnnDe(nn.matrix = wand.nn)
# Compact summary of the density vector
str(wand.knn.density)
##  num [1:1000] 0.0391 0.0677 0.0526 0.0585 0.042 ...