This tutorial will introduce you to the core functionalities of the openPrimeR package.
If you’re already familiar with these functions, you can skip this tutorial. Otherwise, let’s start by attaching the openPrimeR package:
library(openPrimeR)
Let’s try to load the sequences from the following FASTA file containing germline cDNA from human immunoglobulin heavy chains:
fasta.file <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
Please use an editor to get acquainted with the format of the file and then take a look at the template sequences using read_templates()
.
# Load the template sequences from 'fasta.file' and view the results
fasta.file <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
# Load the template sequences from 'fasta.file'
fasta.file <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
template.df <- read_templates(fasta.file)
asS3(template.df)
As you can see, the templates were successfully loaded. Please explore the loaded templates. You can change columns by using the top right and left arrows. Comparing the loaded templates with the original FASTA file, we can see that the metadata contained in the headers of the FASTA file were not annotated by read_templates()
.
To store the metadata correctly, you can use the hdr.structure
and delim
arguments of read_templates()
, where hdr.structure
provides a character vector of identifiers for information contained in the header and delim
is a single character that is used in the header of the FASTA file to delimit individual metadata. When supplying the keyword ‘GROUP’ in the hdr.structure
vector, the templates are annotated with their groups, which is relevant for visualizing properties of the templates later. Now, it’s your turn: Try to load the template sequences with their annotated IGHV groups:
# Load the template sequences such that their groups are annotated correctly
fasta.file <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
hdr.structure <- NULL # adjust according to the FASTA header
delim <- NULL # adjust according to the FASTA header
template.df <- read_templates(fasta.file, hdr.structure = hdr.structure,
delim = delim)
# Load the template sequences such that their groups are annotated correctly.
# 'hdr.structure' names the fields of each FASTA header and 'delim' is the
# character separating them; the "GROUP" keyword marks the field that holds
# the template group.
fasta.file <- system.file("extdata", "IMGT_data", "templates",
    "Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
delim <- "|"
# Pass the 'delim' variable instead of repeating the literal, so the
# delimiter is defined in exactly one place.
template.df <- read_templates(fasta.file, hdr.structure = hdr.structure,
    delim = delim)
Note that, if not specified otherwise via the id.column
argument, the first entry of hdr.structure
is used as the template identifier. We can verify that the template groups are available now via:
fasta.file <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
delim <- "|"
template.df <- read_templates(fasta.file, hdr.structure = hdr.structure,
delim = "|")
print(template.df$Group)
Next, we will move on to defining the binding region of the primers in the templates. Since we deal with immunological templates, we want to annotate the leader region for each sequence individually. For this purpose, we will use a FASTA file containing the leader sequences corresponding to the templates:
leader.fasta <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_leader.fasta", package = "openPrimeR")
Again, you may want to take a look at the structure of the file before continuing. The entries in the file containing the individual binding regions should match those in the template FASTA file and the provided regions should be subsequences of the loaded sequences.
We can define the binding region in the templates using assign_binding_regions()
; since we only care about binding of the forward primers, we will only adjust the forward binding region.
# Assign the forward binding regions from 'leader.fasta' to 'template.df':
leader.fasta <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_leader.fasta", package = "openPrimeR")
# Assign the forward binding regions from 'leader.fasta' to 'template.df':
leader.fasta <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_leader.fasta", package = "openPrimeR")
template.df <- assign_binding_regions(template.df, fw = leader.fasta, rev = NULL)
We can verify that the binding regions for forward primers were annotated successfully in the following way:
fasta.file <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
delim <- "|"
template.df <- read_templates(fasta.file, hdr.structure = hdr.structure,
delim = "|")
leader.fasta <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_leader.fasta", package = "openPrimeR")
template.df <- assign_binding_regions(template.df, fw = leader.fasta, rev = NULL)
utils::head(template.df$Allowed_fw)
## [1] "atggactggacctggagcatccttttcttggtggcagcaccaacaggtgcccactcc"
## [2] "atggactggacctggagcatccttttcttggtggcagcagcaacaggtgcccactcc"
## [3] "atggactggacctggagcatccttttcttggtggcagcagcaacaggtgcccactcc"
## [4] "atggactggacctggaggatcctcttcttggtggcagcagccacaggagcccactcc"
## [5] "atggactggacctggaggatcctcttcttggtggcagcagccacaggagcccactcc"
## [6] "atggactggacctggaggatcctcttcttggtggcagcagccacaggagcccactcc"
cbind(utils::head(template.df$Allowed_Start_fw), utils::head(template.df$Allowed_End_fw))
## [,1] [,2]
## [1,] 1 57
## [2,] 1 57
## [3,] 1 57
## [4,] 1 57
## [5,] 1 57
## [6,] 1 57
where Allowed_fw
contains the sequence of the leader region (the region before the exon) and Allowed_Start_fw
and Allowed_End_fw
provide the interval of allowed binding positions in the templates for forward primers.
As a final step, we need to modify the binding region for forward primers such that the binding region includes the first position of the exon, which lies outside the leader. Hence, we will extend the current binding region by one position. For this purpose, adjust_binding_regions
can be used. This function requires a modified binding range, relative to the previously annotated binding region. The relative positions are defined such that position 0 is the first position after the end of the annotated binding region. For example, assigning the interval [0, 30] would allow binding within the first 30 positions of the exon. Next, please extend the binding region annotation of template.df
by one position via adjust_binding_regions()
:
# Extend the binding regions of 'template.df' by one position:
# Extend the binding regions of 'template.df' by one position:
template.df <- adjust_binding_regions(template.df,
c(-max(template.df$Allowed_End_fw), 0), NULL)
# Verify the new annotation:
head(cbind(template.df$Allowed_Start_fw, template.df$Allowed_End_fw))
Note that we have chosen the negated maximal end position as the relative start of the interval so that the start of the binding region remains unchanged, while we have extended the end of the binding region to position 0, where 0 indicates the first position of the exon.
Existing sets of primers can also be loaded from FASTA files. Why don’t you try loading the following primer FASTA file using read_primers()
into the primer.df
variable?
fasta.file <- system.file("extdata", "IMGT_data", "primers",
"IGHV", "Tiller2008_1st.fasta", package = "openPrimeR")
# Load the primers from 'fasta.file' into 'primer.df' and explore the data
fasta.file <- system.file("extdata", "IMGT_data", "primers",
"IGHV", "Tiller2008_1st.fasta", package = "openPrimeR")
# Load the primers from 'fasta.file' into 'primer.df' and explore the data
fasta.file <- system.file("extdata", "IMGT_data", "primers",
"IGHV", "Tiller2008_1st.fasta", package = "openPrimeR")
primer.df <- read_primers(fasta.file)
print(primer.df)
As we can see, the data frame only contains entries in the Forward
, but not the Reverse
column indicating that this is a set of forward primers only. Moreover, we obtain some basic information about the primers such as their length via primer_length_fw
and their degeneracy via Degeneracy_fw
. As a careful reader, you may now wonder how it was possible to annotate the primers with their directionalities without specifying any identifiers.
This was possible because the input FASTA file already provided the default keywords used by read_primers
for identifying the directions of the primers, which you can verify by checking that the tag _fw
is shared by all entries in the ID
column. For your own primers, it is crucial that you tag the directionality of the primers in the FASTA file correctly and then specify the respective keywords using the fw.id
and rev.id
arguments in case that you are deviating from the defaults.
There’s a lot of experience with performing PCRs and therefore we know which physicochemical properties of primers are favorable for amplification and which are not. In openPrimeR, the desired ranges of values of these properties are called constraints
and they are stored in a DesignSettings
object.
Let’s define some constraints by loading one of the XML settings files that are shipped with the package by storing the result of read_settings()
in the settings
variable:
# Load the specified XML file into the 'settings' variable:
xml.file <- system.file("extdata", "settings", "B_Taq_PCR_evaluate.xml",
package = "openPrimeR")
# Load the specified XML file into the 'settings' variable:
xml.file <- system.file("extdata", "settings", "B_Taq_PCR_evaluate.xml",
package = "openPrimeR")
settings <- read_settings(xml.file)
xml.file <- system.file("extdata", "settings", "B_Taq_PCR_evaluate.xml",
package = "openPrimeR")
settings <- read_settings(xml.file)
Let’s take a look at the structure of settings
:
print(settings)
As you can see, the DesignSettings
object contains far more information than just the constraint settings, but let’s focus on the constraints first.
You can change which physicochemical properties are considered and the desired range of property values by using the constraints()
function:
# View the constraints specified in 'settings':
constraints(settings)
The constraints()
function returns a named list where the names provide the identifiers of the active constraints and the constraints are specified as named vectors containing the entries min
and max
where min
indicates the smallest allowed value and max
indicates the maximal allowed value of a property. If either min
or max
is missing, this indicates that the corresponding range is unlimited. For example, the extent of the GC clamp should be between 1 and 3, while the coverage of every primer should be at least 1 (there’s no upper limit).
You can customize the constraint settings by using constraints()
as a setter. Why don’t you try to exclude the GC clamp property from consideration and increase the required number of coverage events per primer to 5? Note that you always have to provide named (min
and/or max
) numeric vectors when modifying constraints.
# Remove the GC clamp constraint and set the minimal primer coverage to 5
# Remove the GC clamp constraint and set the minimal primer coverage to 5
constraints(settings) <- constraints(settings)[names(constraints(settings)) != "gc_clamp"]
constraints(settings)$primer_coverage <- c("min" = 5)
constraints(settings)
xml.file <- system.file("extdata", "settings", "B_Taq_PCR_evaluate.xml",
package = "openPrimeR")
settings <- read_settings(xml.file)
constraints(settings) <- constraints(settings)[names(constraints(settings)) != "gc_clamp"]
constraints(settings)$primer_coverage <- c("min" = 5)
The coverage constraints determine under which circumstances a primer is considered to cover a template, which means that the primer is likely to successfully amplify the corresponding template. Depending on the selected coverage constraints, the estimate of amplified templates can be more sensitive (e.g. free energy of annealing) or more specific (modeling terminal mismatches). You can modify the coverage conditions using cvg_constraints()
:
# View the active coverage constraints
# View the active coverage constraints
cvg_constraints(settings)
We see that the maximal false positive rate for calling coverage events is set to 5%. This means that only coverage events whose estimated probability of being false positives is below 5% are retained, while all other events are removed. stop_codon
and substitution
are codon design constraints, where 1 denotes coverage events inducing stop codons or substitutions and 0 indicates the absence of such events. Since the maxima of both constraints are set to 1, we do not filter coverage events according to these events at the moment. To learn about other possible coverage constraints, we refer to the openPrimeR manual or the output of viewing the DesignSettings
object, which indicates the inactive constraints.
The constraint options define additional options for the computation of some constraints. Let us now review the active constraint options via conOptions()
:
# View the active constraint options
# View the active constraint options
conOptions(settings)
In the constraint options, allowed_mismatches
provides the maximal number of mismatches between primers and templates, allowed_other_binding_ratio
sets the maximal ratio of off-target binding events, and allowed_region_definition
determines whether primers are required to bind within the specified binding region or may also only overlap with the target region.
Please specify the following settings now:
- Set allowed_mismatches to 0 to ensure that only fully complementary primers are considered to cover the templates.
- Set allowed_other_binding_ratio to 0 to ensure that only primers binding to the target region are considered to cover the templates.
- Set allowed_region_definition to "any" to consider the coverage of primers that only overlap with the target region.
# Modify the constraint options as suggested
conOptions(settings)$allowed_mismatches <- 0
conOptions(settings)$allowed_other_binding_ratio <- 0
conOptions(settings)$allowed_region_definition <- "any"
conOptions(settings)
xml.file <- system.file("extdata", "settings", "B_Taq_PCR_evaluate.xml",
package = "openPrimeR")
settings <- read_settings(xml.file)
constraints(settings) <- constraints(settings)[names(constraints(settings)) != "gc_clamp"]
constraints(settings)$primer_coverage <- c("min" = 5)
conOptions(settings)$allowed_mismatches <- 0
conOptions(settings)$allowed_other_binding_ratio <- 0
conOptions(settings)$allowed_region_definition <- "any"
Last, we should take a look at the current PCR conditions using PCR()
:
# Investigate the PCR conditions
# Investigate the PCR conditions
PCR(settings)
When performing an analysis, please ensure that all ion concentrations (molar) and other PCR conditions (e.g. the annealing temperature) reflect your experimental setup correctly. We will retain the default settings and just note that the PCR settings can be adjusted analogously to the other settings.
xml.file <- system.file("extdata", "settings", "B_Taq_PCR_evaluate.xml",
package = "openPrimeR")
settings <- read_settings(xml.file)
constraints(settings) <- constraints(settings)[names(constraints(settings)) != "gc_clamp"]
constraints(settings)$primer_coverage <- c("min" = 5)
conOptions(settings)$allowed_mismatches <- 0
conOptions(settings)$allowed_other_binding_ratio <- 0
conOptions(settings)$allowed_region_definition <- "any"
fasta.file <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
delim <- "|"
template.df <- read_templates(fasta.file, hdr.structure = hdr.structure,
delim = "|")
leader.fasta <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_leader.fasta", package = "openPrimeR")
template.df <- assign_binding_regions(template.df, fw = leader.fasta, rev = NULL)
template.df <- adjust_binding_regions(template.df,
c(-max(template.df$Allowed_End_fw), 0), NULL)
fasta.file <- system.file("extdata", "IMGT_data", "primers",
"IGHV", "Tiller2008_1st.fasta", package = "openPrimeR")
primer.df <- read_primers(fasta.file)
To learn more about the properties of the primers, we can use check_constraints()
. In this part of the tutorial, we will specifically deal with the coverage of the templates that is afforded by the primers. To analyze only the coverage, you can supply primer_coverage
to the active.constraints
argument of check_constraints()
:
# Evaluate the primer coverage and store the results in 'constraint.df'
# Evaluate the primer coverage and store the results in 'constraint.df'
constraint.df <- check_constraints(primer.df, template.df, settings, active.constraints = "primer_coverage")
xml.file <- system.file("extdata", "settings", "B_Taq_PCR_evaluate.xml",
package = "openPrimeR")
settings <- read_settings(xml.file)
constraints(settings) <- constraints(settings)[names(constraints(settings)) != "gc_clamp"]
constraints(settings)$primer_coverage <- c("min" = 5)
conOptions(settings)$allowed_mismatches <- 0
conOptions(settings)$allowed_other_binding_ratio <- 0
conOptions(settings)$allowed_region_definition <- "any"
fasta.file <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
delim <- "|"
template.df <- read_templates(fasta.file, hdr.structure = hdr.structure,
delim = "|")
leader.fasta <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_leader.fasta", package = "openPrimeR")
template.df <- assign_binding_regions(template.df, fw = leader.fasta, rev = NULL)
template.df <- adjust_binding_regions(template.df,
c(-max(template.df$Allowed_End_fw), 0), NULL)
fasta.file <- system.file("extdata", "IMGT_data", "primers",
"IGHV", "Tiller2008_1st.fasta", package = "openPrimeR")
primer.df <- read_primers(fasta.file)
constraint.df <- check_constraints(primer.df, template.df, settings, active.constraints = "primer_coverage")
Please investigate the structure of constraint.df
. The column primer_coverage
provides the number of covered templates and the column Covered_Seqs
gives comma-separated strings with the identifiers of the covered templates. Let’s try to find the primer with the highest coverage as well as the template sequences that are covered by the primer:
# Investigate the structure of the primers and then find the primer with the highest coverage and the templates that it covers
asS3(constraint.df)
# Row index of the primer that covers the largest number of templates
max.idx <- which.max(constraint.df$primer_coverage)
# Look up the primer ID in the same object that 'max.idx' was computed on,
# so the result does not depend on 'primer.df' having an identical row order.
max.ID <- constraint.df$ID[max.idx]
print(max.ID)
# 'Covered_Seqs' holds the covered template identifiers as one
# comma-separated string per primer; split it into individual identifiers.
covered.templates.id <- strsplit(constraint.df$Covered_Seqs[max.idx], split = ",")[[1]]
# Translate the identifiers into the corresponding template IDs
covered.templates <- template.df$ID[match(covered.templates.id, template.df$Identifier)]
print(covered.templates)
Great! Let’s visualize which templates are covered by the primers using plot_template_cvg()
.
plot_template_cvg(constraint.df, template.df)
In the plot, Identity Coverage
indicates the coverage when requiring full complementarity, while Expected Coverage
provides the coverage when applying the coverage constraints. Since we didn’t allow for any mismatches, both coverage values are basically identical. Available Templates
provides the number of template sequences per group. From the plot we can see that, when we don’t allow for any mismatches, about 47% of the templates are covered by the primers and that IGHV3, IGHV4, IGHV5, and IGHV7 are (partially) covered. To find out which primer amplifies which template groups, we can use plot_primer_cvg
:
# Plot the primer coverage
plot_primer_cvg(constraint.df, template.df)
This plot reveals that each primer binds with 100% complementarity only to individual groups of templates; note that the primer VH_1
targets only IGHV7, which consists of a single template sequence.
Our analysis was extremely conservative, since we didn’t consider mismatch binding events. To more accurately estimate the coverage of the primers, let’s ramp up the number of considered template-primer binding events by allowing up to 7 mismatches using conOptions()
:
# Increase the number of allowed mismatches to 7
conOptions(settings)$allowed_mismatches <- 7
xml.file <- system.file("extdata", "settings", "B_Taq_PCR_evaluate.xml",
package = "openPrimeR")
settings <- read_settings(xml.file)
constraints(settings) <- constraints(settings)[names(constraints(settings)) != "gc_clamp"]
constraints(settings)$primer_coverage <- c("min" = 5)
conOptions(settings)$allowed_mismatches <- 7
conOptions(settings)$allowed_other_binding_ratio <- 0
conOptions(settings)$allowed_region_definition <- "any"
fasta.file <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
delim <- "|"
template.df <- read_templates(fasta.file, hdr.structure = hdr.structure,
delim = "|")
leader.fasta <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_leader.fasta", package = "openPrimeR")
template.df <- assign_binding_regions(template.df, fw = leader.fasta, rev = NULL)
template.df <- adjust_binding_regions(template.df,
c(-max(template.df$Allowed_End_fw), 0), NULL)
fasta.file <- system.file("extdata", "IMGT_data", "primers",
"IGHV", "Tiller2008_1st.fasta", package = "openPrimeR")
primer.df <- read_primers(fasta.file)
constraint.df <- check_constraints(primer.df, template.df, settings, active.constraints = "primer_coverage")
Let’s re-analyze the coverage with the changed settings using check_constraints()
:
# Compute the coverage again
constraint.df <- check_constraints(primer.df, template.df, settings, active.constraints = "primer_coverage")
Let’s visualize the template and primer coverage again to identify how the coverage has changed:
# Plot the template coverage and the primer coverage
plot_template_cvg(constraint.df, template.df)
plot_primer_cvg(constraint.df, template.df)
The new results are impressively different to the previous ones. When we allow for more mismatches, quite a large percentage of templates are estimated to be covered and even the VH_1
primer is revealed to cover multiple template groups at the same time. We can take a closer look at the distribution of coverage events occurring for different numbers of mismatches between primers and templates by supplying the boolean per.mismatch
argument to the two plotting functions:
# Plot the template coverage and the primer coverage, stratified by mismatches
plot_template_cvg(constraint.df, template.df, per.mismatch = TRUE)
plot_primer_cvg(constraint.df, template.df, per.mismatch = TRUE)
These plots reveal that allowing for only 1 mismatch already provides more than 50% coverage and that most coverage events of VH_1
occur with at least 6 mismatches.
Note that the estimated coverage takes into account only the properties of the primers that are directly associated with binding to a template (e.g. free energy and mismatches). Of course, there are many more properties that can influence whether an amplification is successful or not.
For example, if a primer forms a complex with another primer in a multiplex reaction, this may greatly reduce product yields. Therefore, the estimated coverage should be used as an indicator for the coverage that can be achieved if the other properties of the primers are reasonable. To determine if this is the case, we will analyze these properties in the next part of the tutorial.
Analyzing the physicochemical properties of the primers is facilitated by check_constraints()
. Since we’ve already analyzed the primer coverage, we will compute all remaining constraints now by passing the corresponding active.constraints
character vector to check_constraints()
:
xml.file <- system.file("extdata", "settings", "B_Taq_PCR_evaluate.xml",
package = "openPrimeR")
settings <- read_settings(xml.file)
constraints(settings) <- constraints(settings)[names(constraints(settings)) != "gc_clamp"]
constraints(settings)$primer_coverage <- c("min" = 5)
conOptions(settings)$allowed_mismatches <- 7
conOptions(settings)$allowed_other_binding_ratio <- 0
conOptions(settings)$allowed_region_definition <- "any"
fasta.file <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
delim <- "|"
template.df <- read_templates(fasta.file, hdr.structure = hdr.structure,
delim = "|")
leader.fasta <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_leader.fasta", package = "openPrimeR")
template.df <- assign_binding_regions(template.df, fw = leader.fasta, rev = NULL)
# Extend the forward binding region by one position (relative end 0); the
# relative start is the negated maximal end position.
# 'adjust_binding_regions' is called without ':::' here, as in the other
# chunks of this tutorial, because it is part of the attached package's API.
template.df <- adjust_binding_regions(template.df,
    c(-max(template.df$Allowed_End_fw), 0), NULL)
fasta.file <- system.file("extdata", "IMGT_data", "primers",
"IGHV", "Tiller2008_1st.fasta", package = "openPrimeR")
primer.df <- read_primers(fasta.file)
constraint.df <- check_constraints(primer.df, template.df, settings, active.constraints = "primer_coverage")
# Check all constraints except for 'primer_coverage'
constraint.df <- check_constraints(constraint.df, template.df, settings,
active.constraints = setdiff(names(constraints(settings)), "primer_coverage"))
asS3(constraint.df)
You can access the values of the computed properties in constraint.df
. Fields containing the keyword EVAL
indicate whether a primer passed or failed the constraints defined in settings
. Let’s try to access the computed melting temperatures and verify that the temperatures are in the desired range:
xml.file <- system.file("extdata", "settings", "B_Taq_PCR_evaluate.xml",
package = "openPrimeR")
settings <- read_settings(xml.file)
constraints(settings) <- constraints(settings)[names(constraints(settings)) != "gc_clamp"]
constraints(settings)$primer_coverage <- c("min" = 5)
conOptions(settings)$allowed_mismatches <- 7
conOptions(settings)$allowed_other_binding_ratio <- 0
conOptions(settings)$allowed_region_definition <- "any"
fasta.file <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
delim <- "|"
template.df <- read_templates(fasta.file, hdr.structure = hdr.structure,
delim = "|")
leader.fasta <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_leader.fasta", package = "openPrimeR")
template.df <- assign_binding_regions(template.df, fw = leader.fasta, rev = NULL)
# Extend the forward binding region by one position (relative end 0); the
# relative start is the negated maximal end position.
# 'adjust_binding_regions' is called without ':::' here, as in the other
# chunks of this tutorial, because it is part of the attached package's API.
template.df <- adjust_binding_regions(template.df,
    c(-max(template.df$Allowed_End_fw), 0), NULL)
fasta.file <- system.file("extdata", "IMGT_data", "primers",
"IGHV", "Tiller2008_1st.fasta", package = "openPrimeR")
primer.df <- read_primers(fasta.file)
constraint.df <- check_constraints(primer.df, template.df, settings)
# Verify whether the melting temperatures are in the desired range
tm <- constraint.df$melting_temp
print(paste0("Melting temperatures: ", paste0(tm, collapse = ",")))
print(constraints(settings)$melting_temp_range)
idx <- which(constraint.df$EVAL_melting_temp_range)
print(tm[idx])
print(constraint.df[idx, "melting_temp"] <= constraints(settings)$melting_temp_range["max"] &
constraint.df[idx, "melting_temp"] >= constraints(settings)$melting_temp_range["min"])
Since analyzing the properties in this way is cumbersome, we’ll create a visualization showing which primers fulfill the constraints using plot_constraint_fulfillment()
:
xml.file <- system.file("extdata", "settings", "B_Taq_PCR_evaluate.xml",
package = "openPrimeR")
settings <- read_settings(xml.file)
constraints(settings) <- constraints(settings)[names(constraints(settings)) != "gc_clamp"]
constraints(settings)$primer_coverage <- c("min" = 5)
conOptions(settings)$allowed_mismatches <- 7
conOptions(settings)$allowed_other_binding_ratio <- 0
conOptions(settings)$allowed_region_definition <- "any"
fasta.file <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_exon.fasta", package = "openPrimeR")
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
delim <- "|"
template.df <- read_templates(fasta.file, hdr.structure = hdr.structure,
delim = "|")
leader.fasta <- system.file("extdata", "IMGT_data", "templates",
"Homo_sapiens_IGH_functional_leader.fasta", package = "openPrimeR")
template.df <- assign_binding_regions(template.df, fw = leader.fasta, rev = NULL)
# Extend the forward binding region by one position (relative end 0); the
# relative start is the negated maximal end position.
# 'adjust_binding_regions' is called without ':::' here, as in the other
# chunks of this tutorial, because it is part of the attached package's API.
template.df <- adjust_binding_regions(template.df,
    c(-max(template.df$Allowed_End_fw), 0), NULL)
fasta.file <- system.file("extdata", "IMGT_data", "primers",
"IGHV", "Tiller2008_1st.fasta", package = "openPrimeR")
primer.df <- read_primers(fasta.file)
constraint.df <- check_constraints(primer.df, template.df, settings)
# Plot the constraint fulfillment
plot_constraint_fulfillment(constraint.df, settings)
The plot quickly reveals that the main flaw of the primer set is associated with a high spread of primer melting temperatures. To quantify the deviations of the primer properties from the target constraints, we can use plot_constraint_deviation
:
# Plot the constraint deviation
plot_constraint_deviation(constraint.df, settings)
The plot shows that the constraint on the melting temperature deviations was violated most strongly, while some other constraints were violated only marginally. From these results we can conclude that, overall, the properties of the primers are reasonable. However, the high deviations in melting temperatures may require a relatively low PCR annealing temperature and therefore lead to unspecific binding of the primers exhibiting high melting temperatures.
Note that we haven’t determined whether the GC clamp criterion is fulfilled since we’ve removed it at the beginning of the tutorial. Let’s determine the values for the GC clamp and redraw the plots for the constraints:
# Require 1 to 3 terminal GCs. Then determine the 'gc_clamp' property and plot the constraint fulfillment and deviation once again.
# require 1 to 3 terminal GCs
constraints(settings)$gc_clamp <- c("min" = 1, "max" = 3)
# update the evaluated primers
constraint.df <- check_constraints(constraint.df, template.df, settings, active.constraints = "gc_clamp")
# visualize the constraints
plot_constraint_fulfillment(constraint.df, settings)
plot_constraint_deviation(constraint.df, settings)
Great! Finally, we’ve really evaluated all constraints and there were no bad surprises when including the GC clamp condition.
Having finished our analysis of the constraints, we may want to store the results in some way. For this purpose, there are two options: storing evaluated data as a CSV file or creating a PDF report. Storing the results as a CSV file can be time-saving if you want to re-analyze the same data at a later point in time, while the PDF report can be used for filing the results. You can store primers as a CSV with write_primers()
and create a report with create_report()
.
At its core, comparing primer sets is based on the same functions as for analyzing individual primer sets. Therefore, we will now simply load several pre-evaluated sets of primers and their corresponding templates:
# Names of the pre-evaluated primer sets that should be compared
sel.sets <- c("Glas", "Rubinstein", "Persson", "Ippolito", "Scheid")
# 'pattern' is a regular expression, not a glob: anchor on the '.csv'
# extension instead of using a leading '*'.
primer.files <- list.files(path = system.file("extdata", "IMGT_data", "comparison",
        "primer_sets", "IGH", package = "openPrimeR"),
    pattern = "\\.csv$", full.names = TRUE)
primer.data <- read_primers(primer.files)
# For every selected set, pick the first loaded entry whose name matches;
# vapply() guarantees an integer index vector (NA if nothing matches).
sel.idx <- vapply(sel.sets, function(x) grep(x, names(primer.data))[1],
    integer(1), USE.NAMES = FALSE)
primer.data <- primer.data[sel.idx]
# Every primer set is paired with the same template file
template.files <- rep(system.file("extdata", "IMGT_data", "comparison", "templates",
        "IGH_templates.csv", package = "openPrimeR"),
    length(primer.data))
template.data <- read_templates(template.files)
To compare the coverage of the templates that is afforded by each primer set, we can use plot_template_cvg()
:
# Names of the primer sets that should be compared
sel.sets <- c("Glas", "Rubinstein", "Persson", "Ippolito", "Scheid")
# Collect the pre-evaluated primer CSV files supplied with the package.
# 'pattern' is a regular expression (not a glob), so match the file
# extension explicitly and anchor it at the end of the file name.
primer.files <- list.files(
  path = system.file("extdata", "IMGT_data", "comparison",
                     "primer_sets", "IGH", package = "openPrimeR"),
  pattern = "\\.csv$", full.names = TRUE
)
primer.data <- read_primers(primer.files)
# For each selected set, take the first loaded entry whose name matches.
# vapply() guarantees one integer index per set (NA if nothing matches).
sel.idx <- vapply(
  sel.sets,
  function(x) grep(x, names(primer.data))[1],
  integer(1),
  USE.NAMES = FALSE
)
primer.data <- primer.data[sel.idx]
# Supply the same template file once per selected primer set
template.files <- rep(
  system.file("extdata", "IMGT_data", "comparison", "templates",
              "IGH_templates.csv", package = "openPrimeR"),
  length(primer.data)
)
template.data <- read_templates(template.files)
# Plot the template coverage of every primer set
plot_template_cvg(primer.data, template.data)
The plot shows that the primer sets from Glas and Ippolito outperform the other sets with respect to coverage. The sets from Persson, Rubinstein, and Scheid seem to lack coverage of IGHV2. To gain a better understanding of the qualities of the individual primer sets, let’s analyze the distribution of coverage events with regard to mismatches:
# Plot the template coverage again, this time broken down with regard to
# mismatches (per.mismatch = TRUE)
plot_template_cvg(primer.data, template.data, per.mismatch = TRUE)
From the distribution of binding events with regard to mismatches, we see that the Ippolito set performs favorably for few mismatches, since it already achieves high coverage without introducing many mutations into the sequences. While the sets from Scheid and Glas perform similarly well for few mismatches, the sets from Persson and Rubinstein achieve high coverages only with many mismatches.
Apart from the coverage, the number of primers may be of concern since unnecessarily large sets may increase costs and be subject to primer-primer interactions. We can analyze the relationship between the percentage of covered templates and the size of the primer sets through plot_cvg_vs_set_size()
:
# Plot the percentage of covered templates against the size of each primer set
plot_cvg_vs_set_size(primer.data, template.data)
The plot reveals that the primer sets from Persson, Rubinstein, and Ippolito have acceptable sizes and that the sets from Scheid and Glas may be prohibitively large. The plot also reveals to which degree the sets fulfill the constraints as the radii of the points indicate the degree of constraint fulfillment. Since the radii of the sets are quite similar there doesn’t seem to be a big difference between the sets with respect to constraint fulfillment.
We’ll now investigate which constraints are unsatisfied by using plot_constraint_fulfillment()
and determine the extent to which the constraints are broken using plot_constraint_deviation()
:
library(openPrimeR)
# Load the evaluation settings from the XML file supplied with the package
xml.file <- system.file("extdata", "settings", "B_Taq_PCR_evaluate.xml",
                        package = "openPrimeR")
settings <- read_settings(xml.file)
# don't consider specificity: we don't want to interpret this here ..
constraint.names <- names(constraints(settings))
constraints(settings) <- constraints(settings)[constraint.names != "primer_specificity"]
# Names of the primer sets that should be compared
sel.sets <- c("Glas", "Rubinstein", "Persson", "Ippolito", "Scheid")
# Collect the pre-evaluated primer CSV files supplied with the package.
# 'pattern' is a regular expression (not a glob), so match the file
# extension explicitly and anchor it at the end of the file name.
primer.files <- list.files(
  path = system.file("extdata", "IMGT_data", "comparison",
                     "primer_sets", "IGH", package = "openPrimeR"),
  pattern = "\\.csv$", full.names = TRUE
)
primer.data <- read_primers(primer.files)
# For each selected set, take the first loaded entry whose name matches.
# vapply() guarantees one integer index per set (NA if nothing matches).
sel.idx <- vapply(
  sel.sets,
  function(x) grep(x, names(primer.data))[1],
  integer(1),
  USE.NAMES = FALSE
)
primer.data <- primer.data[sel.idx]
# Supply the same template file once per selected primer set
template.files <- rep(
  system.file("extdata", "IMGT_data", "comparison", "templates",
              "IGH_templates.csv", package = "openPrimeR"),
  length(primer.data)
)
template.data <- read_templates(template.files)
# Plot the constraint fulfillment and deviation
plot_constraint_fulfillment(primer.data, settings)
plot_constraint_deviation(primer.data, settings)
Looking at the results, we can make the following findings:
In summary, the primers from Persson fulfill the constraints extremely well. The sets from Glas, Rubinstein, and Scheid may be problematic due to the high melting temperature deviations.
Finally, let us consider the regions where the primers bind in the templates. Since we’ve allowed for off-target binding events it’s important to verify whether the primers bind close to the target region or not using plot_primer_binding_regions()
:
# Plot the positions in the templates where the primers of each set bind
plot_primer_binding_regions(primer.data, template.data)
In this case, the blue region indicates the leader of the immunoglobulins, while the red region shows the variable exon region. Hence, the primers should start amplifying the templates at position 0 (the start of the exon) at the latest. However, this is not the case for all primer sets:
Considering all criteria, we would probably want to select the set from Ippolito for performing PCRs on the heavy chains of immunoglobulins due to the following reasons:
If you’d like to store the results of comparing the primer sets you can create a PDF report using create_report()
.
You can design novel primer sets via design_primers()
. Before you can start designing primers, you should perform the following steps:
You should already be familiar with steps 1 to 3 from the previous tutorial sections, so we will only quickly recapitulate these steps and then focus on using the primer design function. In this part of the tutorial, we will design forward primers for the leaders of the heavy chain of human germline immunoglobulins and will set up the template sequences accordingly in the following.
We will load the functional human immunoglobulin heavy chain variable segments from a FASTA file with read_templates()
:
fasta.file <- system.file(
  "extdata", "IMGT_data", "templates",
  "Homo_sapiens_IGH_functional_exon.fasta",
  package = "openPrimeR"
)
# Load the template data
fasta.file <- system.file(
  "extdata", "IMGT_data", "templates",
  "Homo_sapiens_IGH_functional_exon.fasta",
  package = "openPrimeR"
)
# Read the templates; the header fields are separated by '|' and are
# annotated with the identifiers given in 'hdr.structure'
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
template.df <- read_templates(
  fasta.file,
  hdr.structure = hdr.structure,
  delim = "|"
)
fasta.file <- system.file(
  "extdata", "IMGT_data", "templates",
  "Homo_sapiens_IGH_functional_exon.fasta",
  package = "openPrimeR"
)
# Read the templates; the header fields are separated by '|' and are
# annotated with the identifiers given in 'hdr.structure'
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
template.df <- read_templates(
  fasta.file,
  hdr.structure = hdr.structure,
  delim = "|"
)
We will adjust the target binding regions uniformly across all templates with assign_binding_regions()
. This ensures that the primers are designed for the first positions of the leader:
# Specify the first 30 bases as the binding region for forward primers:
# fw = c(1, 30) sets the forward binding region to positions 1 through 30,
# applied uniformly across all templates
template.df <- assign_binding_regions(template.df, fw = c(1,30))
Let’s load the default settings for designing primers via read_settings()
:
# Locate the XML file with the default design settings supplied by the package:
xml.file <- system.file(
  "extdata", "settings", "A_Taq_PCR_design.xml",
  package = "openPrimeR"
)
# Locate the supplied XML file and read the default design settings from it:
xml.file <- system.file(
  "extdata", "settings", "A_Taq_PCR_design.xml",
  package = "openPrimeR"
)
settings <- read_settings(xml.file)
# Locate the supplied XML file and read the default design settings from it:
xml.file <- system.file(
  "extdata", "settings", "A_Taq_PCR_design.xml",
  package = "openPrimeR"
)
settings <- read_settings(xml.file)
Having loaded the supplied settings, it is your job to verify the settings for your primer design task. Here is a list of considerations that may be relevant for you:
constraints()
.constraintLimits()
.cvg_constraints()
.conOptions()
.PCR()
.Possible fields for each slot of the settings object are described in the openPrimeR manual. Inactive options are shown when printing the settings object:
# Print the settings object to view the currently active settings as well as
# the inactive settings
# Printing shows the active settings together with the other possible,
# currently inactive options
print(settings)
The design_primers()
function consists of three phases:
The procedure for initializing a set of primers is controlled by the init.algo
argument. When set to naive
, primers are initialized by extracting substrings from the input templates. If init.algo
is set to tree
, degenerate primers are created. The maximal degeneracy can be controlled via the max.degen
argument.
The filtering procedure is most affected by the constraints that are provided in the settings
object passed to design_primers
. However, there are also other arguments that influence the filtering procedure. Most importantly, required.cvg
provides a numeric in the range [0,1] which indicates the desired coverage ratio of the templates. If the desired coverage ratio can’t be achieved because too many primers have been filtered, a relaxation procedure is initialized. This procedure adjusts the constraints such that more primer candidates are selected in order to reach the target coverage.
The selection of an optimal set of primers (i.e. solving the set cover problem) can be performed either by a greedy algorithm or an integer linear program. While the worst-case runtime of the greedy algorithm is less than that of the integer linear program, the integer linear program may be able to find a smaller set of optimal primers than the greedy algorithm. To use the greedy algorithm, you can set the opti.algo
argument to Greedy
and to use an integer linear program instead, you can set opti.algo
to ILP
.
Let’s design some primers targeting the leaders of the heavy chain immunoglobulins. Note that the primer design procedure needs some time to finish.
# Read the templates together with their header annotations
fasta.file <- system.file(
  "extdata", "IMGT_data", "templates",
  "Homo_sapiens_IGH_functional_exon.fasta",
  package = "openPrimeR"
)
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
template.df <- read_templates(
  fasta.file,
  hdr.structure = hdr.structure,
  delim = "|"
)
# Use positions 1 to 30 as the binding region for forward primers
template.df <- assign_binding_regions(template.df, fw = c(1, 30))
# Read the design settings from the supplied XML file
xml.file <- system.file(
  "extdata", "settings", "A_Taq_PCR_design.xml",
  package = "openPrimeR"
)
settings <- read_settings(xml.file)
# Fix the primer length to exactly 18 by setting both limits to 18
constraints(settings)$primer_length <- c(min = 18, max = 18)
# Design forward primers (only for the first 5 templates to reduce the runtime) with naive initialization, and greedy optimization:
# Read the templates together with their header annotations
fasta.file <- system.file(
  "extdata", "IMGT_data", "templates",
  "Homo_sapiens_IGH_functional_exon.fasta",
  package = "openPrimeR"
)
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
template.df <- read_templates(
  fasta.file,
  hdr.structure = hdr.structure,
  delim = "|"
)
# Use positions 1 to 30 as the binding region for forward primers
template.df <- assign_binding_regions(template.df, fw = c(1, 30))
# Read the design settings from the supplied XML file
xml.file <- system.file(
  "extdata", "settings", "A_Taq_PCR_design.xml",
  package = "openPrimeR"
)
settings <- read_settings(xml.file)
# Fix the primer length to exactly 18 by setting both limits to 18
constraints(settings)$primer_length <- c(min = 18, max = 18)
# Design forward primers for the first 5 templates only, using naive
# initialization and greedy optimization
primer.df <- design_primers(
  template.df[1:5, ], "fw", settings,
  init.algo = "naive", opti.algo = "Greedy"
)
Excellent! Let’s verify the quality of the designed primers:
# Load templates together with their header annotations
fasta.file <- system.file("extdata", "IMGT_data", "templates",
                          "Homo_sapiens_IGH_functional_exon.fasta",
                          package = "openPrimeR")
hdr.structure <- c("ACCESSION", "GROUP", "SPECIES", "FUNCTION")
template.df <- read_templates(fasta.file, hdr.structure = hdr.structure,
                              delim = "|")
# Define binding regions: positions 1 to 30 for forward primers
template.df <- assign_binding_regions(template.df, fw = c(1, 30))
# Load settings from the supplied XML file
xml.file <- system.file("extdata", "settings", "A_Taq_PCR_design.xml",
                        package = "openPrimeR")
settings <- read_settings(xml.file)
# Modify settings: fix the primer length to exactly 18
constraints(settings)$primer_length <- c("min" = 18, "max" = 18)
# Design forward primers for 10 randomly sampled templates with naive
# initialization and greedy optimization. The seed is fixed so that the
# same templates are sampled on every run.
set.seed(1)
target.templates <- template.df[sample(nrow(template.df), 10), ]
design.data <- design_primers(target.templates, "fw", settings,
                              init.algo = "naive", opti.algo = "Greedy")
# Determine the size of the optimized set:
# Determine the coverage of the optimized set
# Visualize the constraints
# Which constraints were used for the design?
# Which primers passed the filtering procedure?
# Determine the size of the optimized set:
primer.df <- design.data$opti
asS3(primer.df)
# Determine the coverage of the optimized set
plot_template_cvg(primer.df, target.templates)
# Visualize the constraints
plot_constraint_deviation(primer.df, settings)
# Which constraints were used for the design?
design.data$used_constraints
# Which primers passed the filtering procedure?
design.data$filtered$data