K-nearest neighbors:

We read in input.scone.csv, which is our file modified (and renamed) from the get.marker.names() function. The K-nearest neighbor generation is derived from the Fast Nearest Neighbors (FNN) R package, within our function Fnn(), which takes as input the “input markers” to be used, along with the concatenated data previously generated, and the desired k. We advise setting k by default to the total number of cells in the dataset divided by 100, as this has been optimized on existing mass cytometry datasets. The output of this function is a list of two matrices: for each cell, the identity of its k-nearest neighbors (in terms of their row numbers in the dataset used here as input), and the distances to those neighbors.

library(Sconify)
# Path to the example marker table (markers.csv) found under the Sconify
# package's extdata directory
marker.file <- system.file("extdata", "markers.csv", package = "Sconify")
# Parse the table into a list of marker-name vectors
markers <- ParseMarkers(marker.file)

# The parsed list holds the input markers and the functional markers
markers
## $input
##  [1] "CD3(Cd110)Di"           "CD3(Cd111)Di"           "CD3(Cd112)Di"          
##  [4] "CD235-61-7-15(In113)Di" "CD3(Cd114)Di"           "CD45(In115)Di"         
##  [7] "CD19(Nd142)Di"          "CD22(Nd143)Di"          "IgD(Nd145)Di"          
## [10] "CD79b(Nd146)Di"         "CD20(Sm147)Di"          "CD34(Nd148)Di"         
## [13] "CD179a(Sm149)Di"        "CD72(Eu151)Di"          "IgM(Eu153)Di"          
## [16] "Kappa(Sm154)Di"         "CD10(Gd156)Di"          "Lambda(Gd157)Di"       
## [19] "CD24(Dy161)Di"          "TdT(Dy163)Di"           "Rag1(Dy164)Di"         
## [22] "PreBCR(Ho165)Di"        "CD43(Er167)Di"          "CD38(Er168)Di"         
## [25] "CD40(Er170)Di"          "CD33(Yb173)Di"          "HLA-DR(Yb174)Di"       
## 
## $functional
##  [1] "pCrkL(Lu175)Di"  "pCREB(Yb176)Di"  "pBTK(Yb171)Di"   "pS6(Yb172)Di"   
##  [5] "cPARP(La139)Di"  "pPLCg2(Pr141)Di" "pSrc(Nd144)Di"   "Ki67(Sm152)Di"  
##  [9] "pErk12(Gd155)Di" "pSTAT3(Gd158)Di" "pAKT(Tb159)Di"   "pBLNK(Gd160)Di" 
## [13] "pP38(Tm169)Di"   "pSTAT5(Nd150)Di" "pSyk(Dy162)Di"   "tIkBa(Er166)Di"
# Number of nearest neighbors to find. See "Finding Ideal K" vignette
k <- 30

# Markers used as input to the knn search, and markers used for the
# knn statistics
input.markers <- markers[["input"]]
funct.markers <- markers[["functional"]]

# Nearest-neighbor search using the package's built-in function
wand.nn <- Fnn(
  cell.df = wand.combined,
  input.markers = input.markers,
  k = k
)
# The result is a list of 2 matrices. In both, each row is a cell and each
# column is one of its k-nearest neighbors: the first matrix holds the
# identity of each neighbor, the second the euclidean distance between that
# neighbor and the cell of interest

# Neighbor indices
str(wand.nn[[1]])
##  int [1:1000, 1:30] 330 694 268 715 328 678 772 241 924 713 ...
wand.nn[[1]][seq_len(20), seq_len(10)]
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]  330  351  533  337 1000  207  453  614  496   250
##  [2,]  694  571  763  996  374  203   74  832  971   585
##  [3,]  268  246  124  807  366  713  253  579   10   665
##  [4,]  715  352  462  135  876  798  960  112  893   116
##  [5,]  328  728  224  737  585  966  987  568  272     7
##  [6,]  678  558  916  397  532  981   85    9   65   392
##  [7,]  772  828  224  372  669  123   61  974  764   485
##  [8,]  241  582  759  885  122  856  695  219  924   233
##  [9,]  924  219  981  548  422  759  631  532  780   397
## [10,]  713  624  268  107    3  807  498  454  366   836
## [11,]  830  564  403  904  515  176  688  760  516   968
## [12,]  666  844  934  428  886  710  516  635  730   502
## [13,]  532  631  885  134  390  675  637  759  668   916
## [14,]  727  638  604  211  227  139  627  709  592   110
## [15,]  244  982  971  694  237  383  170  423  395   338
## [16,]  641  155  679  338  374  423  608  269  237   447
## [17,]  918  467  619  455  760  652   11  226  525   176
## [18,]  294  652  382  428  760  833  619  730  157   490
## [19,]  415  474  224  720  482  713  677  485  152   577
## [20,]  176  525  830  467  403  619  515  526  652   487
# Neighbor distances (second element of the knn result)
str(wand.nn[[2]])
##  num [1:1000, 1:30] 3.97 2.73 2.47 3.22 3.36 ...
# First 20 cells, 10 closest neighbors each
wand.nn[[2]][seq_len(20), seq_len(10)]
##           [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
##  [1,] 3.965078 4.070074 4.092915 4.120268 4.164760 4.203589 4.392209 4.493517
##  [2,] 2.731685 2.823732 2.903874 3.131379 3.143527 3.301202 3.392992 3.395590
##  [3,] 2.469439 2.648087 2.675868 2.742256 2.811036 2.826947 2.840275 2.894083
##  [4,] 3.217416 3.420631 3.694704 3.768833 3.773349 3.782414 3.820494 3.840619
##  [5,] 3.364253 3.408951 3.410414 3.439393 3.574460 3.748010 3.795265 3.805915
##  [6,] 3.311876 3.393849 3.422467 3.492138 3.867315 3.873742 3.918866 3.970579
##  [7,] 2.827524 2.894920 3.077649 3.150315 3.203723 3.249002 3.341154 3.413131
##  [8,] 3.965427 4.090961 4.133583 4.347810 4.448111 4.575837 4.578400 4.595127
##  [9,] 3.016322 3.263121 3.277102 3.314453 3.330960 3.385717 3.428579 3.428636
## [10,] 2.700998 2.768821 2.809791 2.889148 2.963344 2.999035 3.021868 3.102046
## [11,] 3.221092 3.337421 3.537185 3.538971 3.717566 3.759464 3.825997 3.941702
## [12,] 5.096301 5.203686 5.312953 5.441803 5.455912 5.551291 5.561407 5.577394
## [13,] 3.190725 3.346728 3.373073 3.411534 3.587619 3.592624 3.649726 3.654425
## [14,] 4.167909 4.283862 4.310059 4.492409 4.505718 4.526049 4.614907 4.740029
## [15,] 3.410314 3.523987 3.710101 3.764044 3.793746 3.847133 3.861519 4.026012
## [16,] 2.994630 3.356877 3.550283 3.574922 3.622886 3.660669 3.665880 3.692212
## [17,] 4.366468 4.511866 4.516966 4.558253 4.601060 4.609032 4.942514 4.971157
## [18,] 4.257825 4.534855 4.652209 4.747423 4.964602 5.022785 5.090182 5.159841
## [19,] 2.856397 3.088440 3.137766 3.220843 3.308511 3.336355 3.368790 3.417319
## [20,] 3.391197 3.800335 4.062841 4.145820 4.230996 4.241192 4.273084 4.276462
##           [,9]    [,10]
##  [1,] 4.588153 4.595689
##  [2,] 3.438974 3.473952
##  [3,] 2.963344 2.973477
##  [4,] 3.847745 3.859671
##  [5,] 3.825955 3.838767
##  [6,] 4.059805 4.181442
##  [7,] 3.458267 3.490039
##  [8,] 4.601476 4.605055
##  [9,] 3.439355 3.489788
## [10,] 3.112000 3.150887
## [11,] 3.997109 4.009321
## [12,] 5.651117 5.773673
## [13,] 3.694518 3.734646
## [14,] 5.062603 5.195376
## [15,] 4.044000 4.045845
## [16,] 3.692374 3.706538
## [17,] 4.988718 4.999579
## [18,] 5.200807 5.317304
## [19,] 3.417989 3.422648
## [20,] 4.306341 4.309361

Finding scone values:

This function iterates through each KNN, and performs a series of calculations. The first is fold change values for each marker per KNN, where the user chooses whether this will be based on medians or means. The second is a statistical test, where the user chooses a t test or a Mann-Whitney U test. I prefer the latter, because it does not assume any properties of the distributions. Of note, the p values are adjusted for false discovery rate, and therefore are called q values in the output of this function. The user also inputs a threshold parameter (default 0.05), where the fold change values will only be shown if the corresponding statistical test returns a q value below said threshold. Finally, the “multiple.donor.compare” option, if set to TRUE, will perform a t test based on the mean per-marker values of each donor. This is to allow the user to make comparisons across replicates or multiple donors if that is relevant to the user’s biological questions. This function returns a matrix of cells by computed values (change and statistical test results, labeled either marker.change or marker.qvalue). This matrix is intermediate, as it gets concatenated with the original input matrix in the post-processing step (see the relevant vignette). We show the code and the output below. See the post-processing vignette, where we show how this gets combined with the input data, and additional analysis is performed.

# Per-KNN statistics for the functional markers, with "basal" passed as the
# unstim condition
wand.scone <- SconeValues(
  nn.matrix = wand.nn,
  cell.data = wand.combined,
  scone.markers = funct.markers,
  unstim = "basal"
)

# One row per cell; columns hold the q values and changes per marker,
# followed by the fraction and density columns
wand.scone
## # A tibble: 1,000 x 34
##    `pCrkL(Lu175)Di.IL… `pCREB(Yb176)Di.IL… `pBTK(Yb171)Di.IL… `pS6(Yb172)Di.IL7…
##                  <dbl>               <dbl>              <dbl>              <dbl>
##  1               0.956               0.428              0.842              0.993
##  2               0.507               0.628              0.842              0.855
##  3               0.723               0.648              0.903              0.993
##  4               0.818               0.601              0.781              0.982
##  5               0.663               0.541              0.804              0.993
##  6               0.663               1                  0.914              0.965
##  7               0.683               0.315              0.781              0.982
##  8               0.967               0.698              0.649              0.974
##  9               0.905               0.947              0.903              0.999
## 10               0.656               0.531              0.766              0.965
## # … with 990 more rows, and 30 more variables: cPARP(La139)Di.IL7.qvalue <dbl>,
## #   pPLCg2(Pr141)Di.IL7.qvalue <dbl>, pSrc(Nd144)Di.IL7.qvalue <dbl>,
## #   Ki67(Sm152)Di.IL7.qvalue <dbl>, pErk12(Gd155)Di.IL7.qvalue <dbl>,
## #   pSTAT3(Gd158)Di.IL7.qvalue <dbl>, pAKT(Tb159)Di.IL7.qvalue <dbl>,
## #   pBLNK(Gd160)Di.IL7.qvalue <dbl>, pP38(Tm169)Di.IL7.qvalue <dbl>,
## #   pSTAT5(Nd150)Di.IL7.qvalue <dbl>, pSyk(Dy162)Di.IL7.qvalue <dbl>,
## #   tIkBa(Er166)Di.IL7.qvalue <dbl>, pCrkL(Lu175)Di.IL7.change <dbl>,
## #   pCREB(Yb176)Di.IL7.change <dbl>, pBTK(Yb171)Di.IL7.change <dbl>,
## #   pS6(Yb172)Di.IL7.change <dbl>, cPARP(La139)Di.IL7.change <dbl>,
## #   pPLCg2(Pr141)Di.IL7.change <dbl>, pSrc(Nd144)Di.IL7.change <dbl>,
## #   Ki67(Sm152)Di.IL7.change <dbl>, pErk12(Gd155)Di.IL7.change <dbl>,
## #   pSTAT3(Gd158)Di.IL7.change <dbl>, pAKT(Tb159)Di.IL7.change <dbl>,
## #   pBLNK(Gd160)Di.IL7.change <dbl>, pP38(Tm169)Di.IL7.change <dbl>,
## #   pSTAT5(Nd150)Di.IL7.change <dbl>, pSyk(Dy162)Di.IL7.change <dbl>,
## #   tIkBa(Er166)Di.IL7.change <dbl>, IL7.fraction.cond.2 <dbl>, density <dbl>

For programmers: performing additional per-KNN statistics

If one wants to export KNN data to perform other statistics not available in this package, then I provide a function that produces a list with one element per cell in the original input data matrix, where each element is a matrix of cells x features for that cell's KNN.

I also provide a function to find the KNN density estimation independently of the rest of the “scone.values” analysis, to save time if density is all the user wants. With this density estimation, one can perform interesting analysis, ranging from understanding phenotypic density changes along a developmental progression (see post-processing vignette for an example), to trying out density-based binning methods (e.g. X-shift). Of note, this density is specifically one divided by the average distance to k-nearest neighbors. This specific measure is related to the Shannon Entropy estimate of that point on the manifold (https://hal.archives-ouvertes.fr/hal-01068081/document).

I use this metric to avoid the unusual properties of the volume of a sphere as it increases in dimensions (https://en.wikipedia.org/wiki/Volume_of_an_n-ball). This being said, one can modify this vector to be such a density estimation (example http://www.cs.haifa.ac.il/~rita/ml_course/lectures_old/KNN.pdf), by treating the distance to knn as the radius of an n-dimensional sphere and incorporating said volume accordingly.

An individual with basic programming skills can iterate through these elements to perform the statistics of one’s choosing. Examples would include per-KNN regression and classification, or feature imputation. The additional functionality is shown below, with the example knn.list in the package being the first ten instances:

# Build the KNN list from the cell data and the nearest-neighbor result
wand.knn.list <- MakeKnnList(nn.matrix = wand.nn, cell.data = wand.combined)
# Look at the eighth element of the list
wand.knn.list[[8]]
## # A tibble: 30 x 51
##    `CD3(Cd110)Di` `CD3(Cd111)Di` `CD3(Cd112)Di` `CD235-61-7-15(I… `CD3(Cd114)Di`
##             <dbl>          <dbl>          <dbl>             <dbl>          <dbl>
##  1        -0.720         -0.0535        -0.985             -1.08         -0.759 
##  2        -0.669         -0.241          0.515              0.816        -0.0304
##  3        -0.0558        -0.0951        -0.193             -0.324        -0.0136
##  4        -0.439         -0.346          0.138              0.609        -0.784 
##  5         0.0474        -0.0432        -0.387             -0.325        -0.184 
##  6        -1.39          -0.148         -1.70              -1.30         -1.53  
##  7        -0.129         -0.398         -0.0986             0.186         0.375 
##  8        -0.0688         0.199         -0.447             -0.571        -0.320 
##  9        -0.358         -0.270          1.55              -0.700        -0.306 
## 10        -0.713         -0.109         -1.60              -0.356        -1.12  
## # … with 20 more rows, and 46 more variables: CD45(In115)Di <dbl>,
## #   CD19(Nd142)Di <dbl>, CD22(Nd143)Di <dbl>, IgD(Nd145)Di <dbl>,
## #   CD79b(Nd146)Di <dbl>, CD20(Sm147)Di <dbl>, CD34(Nd148)Di <dbl>,
## #   CD179a(Sm149)Di <dbl>, CD72(Eu151)Di <dbl>, IgM(Eu153)Di <dbl>,
## #   Kappa(Sm154)Di <dbl>, CD10(Gd156)Di <dbl>, Lambda(Gd157)Di <dbl>,
## #   CD24(Dy161)Di <dbl>, TdT(Dy163)Di <dbl>, Rag1(Dy164)Di <dbl>,
## #   PreBCR(Ho165)Di <dbl>, CD43(Er167)Di <dbl>, CD38(Er168)Di <dbl>,
## #   CD40(Er170)Di <dbl>, CD33(Yb173)Di <dbl>, HLA-DR(Yb174)Di <dbl>,
## #   Time <dbl>, Cell_length <dbl>, cPARP(La139)Di <dbl>, pPLCg2(Pr141)Di <dbl>,
## #   pSrc(Nd144)Di <dbl>, pSTAT5(Nd150)Di <dbl>, Ki67(Sm152)Di <dbl>,
## #   pErk12(Gd155)Di <dbl>, pSTAT3(Gd158)Di <dbl>, pAKT(Tb159)Di <dbl>,
## #   pBLNK(Gd160)Di <dbl>, pSyk(Dy162)Di <dbl>, tIkBa(Er166)Di <dbl>,
## #   pP38(Tm169)Di <dbl>, pBTK(Yb171)Di <dbl>, pS6(Yb172)Di <dbl>,
## #   pCrkL(Lu175)Di <dbl>, pCREB(Yb176)Di <dbl>, DNA1(Ir191)Di <dbl>,
## #   DNA2(Ir193)Di <dbl>, Viability1(Pt195)Di <dbl>, Viability2(Pt196)Di <dbl>,
## #   wanderlust <dbl>, condition <chr>
# KNN density estimation, computed from the nearest-neighbor result alone;
# the result holds one value per cell
wand.knn.density <- GetKnnDe(
  nn.matrix = wand.nn
)
str(wand.knn.density)
##  num [1:1000] 0.217 0.289 0.333 0.252 0.257 ...