#!/usr/bin/env python
# MIT License
# Copyright (c) 2024, Technical University of Denmark (DTU)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
""" This part of the design module is used for making combinatorial libraries from DNA fragments."""
# standard libraries
import itertools
import numpy as np
import pandas as pd
# Pydna for the molecular bio
from pydna.design import primer_design
from pydna.design import assembly_fragments
from pydna.assembly import Assembly
from pydna.tm import tm_default as _tm_default
from pydna.dseqrecord import Dseqrecord
# Typing
from typing import List, Callable
class DesignAssembly:
    """Make a combinatorial library from DNA fragments.

    Parameters
    ----------
    list_of_seqs : List[List[Dseqrecord]]
        One inner list per position in the construct; every inner list holds
        the alternative parts for that position.
    list_of_pads : List[Dseqrecord]
        Pad sequences that are inserted between the amplicons before the
        assembly primers are designed (the original documentation states a
        maximum of 40 bp, which matches the ``maxlink=40`` used by
        ``assembly_maker``).
    positions_of_pads : List[int]
        Zero-indexed position in each combination at which the corresponding
        pad is inserted. Must have the same length as ``list_of_pads``.
    target_tm : float
        Target melting temperature forwarded to ``simple_amplicon_maker``.
    limit : int
        Primer length limit forwarded to ``simple_amplicon_maker``.
    overlap : int
        Overlap (bp) forwarded to ``assembly_maker``.
    tm_func : Callable
        Melting-temperature function forwarded to ``simple_amplicon_maker``.

    Raises
    ------
    ValueError
        If ``list_of_pads`` and ``positions_of_pads`` differ in length.

    Notes
    -----
    After construction the object exposes, among others,
    ``list_of_assemblies``, ``primers``, ``unique_primers`` and
    ``unique_amplicons``. Use ``pcr_list_to_dataframe`` or
    ``primer_list_to_dataframe`` for a tabular overview.
    """

    def __init__(
        self,
        list_of_seqs: List[List[Dseqrecord]],
        list_of_pads: List[Dseqrecord],
        positions_of_pads: List[int],
        target_tm=55.0,
        limit=13,
        overlap=35,
        tm_func: Callable = _tm_default,
    ):
        # zip() below would silently drop the surplus pads/positions.
        if len(list_of_pads) != len(positions_of_pads):
            raise ValueError(
                "list_of_pads and positions_of_pads must have the same length "
                f"(got {len(list_of_pads)} and {len(positions_of_pads)})"
            )

        ### 1. INITIALIZING ###
        self.list_of_seqs = list_of_seqs
        self.list_of_pads = list_of_pads
        self.positions_of_pads = positions_of_pads
        # Names are taken from the input records, one list per position.
        self.list_of_names = [
            [seq.name for seq in seq_list] for seq_list in list_of_seqs
        ]

        ### 2. Amplicons, primers, and their temperatures ###
        (
            self.list_of_amplicons,
            self.list_of_amplicon_primers,
            self.list_of_amplicon_primer_temps,
        ) = simple_amplicon_maker(
            self.list_of_seqs,
            self.list_of_names,
            target_tm=target_tm,
            limit=limit,
            primer_tm_func=tm_func,
        )
        # Systematic names, e.g. (1, 1, 1), (1, 1, 2), ...
        self.systematic_names = get_systematic_names(self.list_of_seqs)

        ### 3. COMBINATORIAL LISTS ###
        # The amplicon combinations are converted from tuples to lists
        # because the pads are inserted in place below.
        self.combinatorial_list_of_amplicons = [
            list(combination)
            for combination in get_combinatorial_list(self.list_of_amplicons)
        ]
        self.combinatorial_list_of_names = get_combinatorial_list(self.list_of_names)
        self.combinatorial_list_of_primer_tm = get_combinatorial_list(
            self.list_of_amplicon_primer_temps
        )

        ### 4. Adding PADS ###
        # Insert from the highest position downwards so that earlier
        # insertions do not shift the positions of later ones.
        pads_by_position = sorted(
            zip(self.list_of_pads, self.positions_of_pads),
            key=lambda x: x[1],
            reverse=True,
        )
        for combination in self.combinatorial_list_of_amplicons:
            for pad, position in pads_by_position:
                combination.insert(position, pad)

        ### 5. Assembling and making overlapping primers ###
        self.list_of_assemblies = assembly_maker(
            self.combinatorial_list_of_amplicons, overlap=overlap
        )

        ### 6. Getting all primers, annotating, adding features ###
        self.primers = get_primers(
            self.list_of_assemblies,
            self.combinatorial_list_of_names,
            self.combinatorial_list_of_primer_tm,
        )

        ### 7. Unique primers; also re-points the assemblies at them ###
        self.unique_primers = unique_primers(self.primers, self.list_of_assemblies)

        ### 8. Unique amplicons ###
        self.unique_amplicons = unique_amplicons(self.list_of_assemblies)

    def show_contigs(self):
        """Print every contig with the (truncated) names of its templates.

        Nothing is returned; the overview is written to stdout.
        """
        for systematic_name, assembly in zip(
            self.systematic_names, self.list_of_assemblies
        ):
            print("\nContig" + str(systematic_name))
            for amplicon in assembly:
                # Only the first 15 characters of the name are shown.
                print("Template: ", amplicon.name[0:15])
        return

    def show_variants_lib_df(self):
        """Return a dataframe with one row per variant of the library.

        The leading columns hold the part names of the variant, followed by
        ``Systematic_name`` and a running ``Variant`` number starting at 0.
        """
        combinatorial_lib_variants_df = pd.DataFrame(self.combinatorial_list_of_names)
        combinatorial_lib_variants_df["Systematic_name"] = self.systematic_names
        combinatorial_lib_variants_df["Variant"] = np.arange(
            len(combinatorial_lib_variants_df)
        )
        return combinatorial_lib_variants_df

    def primer_list(self):
        """Return a new list holding the rows of ``self.unique_primers``."""
        return list(self.unique_primers)

    def primer_list_to_dataframe(self):
        """Return a pandas dataframe with one row per unique primer.

        An empty library yields an empty dataframe that still carries the
        expected column names.
        """
        columns = [
            "id",
            "anneals to",
            "sequence",
            "annealing temperature",
            "length",
            "price(DKK)",
            "description",
            "footprint",
            "len_footprint",
        ]
        # Passing the columns to the constructor (instead of assigning them
        # afterwards) also works when there are no rows at all.
        return pd.DataFrame(self.unique_primers, columns=columns)

    def pcr_list_to_dataframe(self):
        """Return a pandas dataframe listing every unique PCR reaction.

        An empty library yields an empty dataframe that still carries the
        expected column names.
        """
        columns = [
            "pcr_number",
            "template",
            "forward_primer",
            "reverse_primer",
            "f_tm",
            "r_tm",
        ]
        rows = [
            [
                "PCR{number}".format(number=number),
                amplicon.name,
                amplicon.forward_primer.id,
                amplicon.reverse_primer.id,
                # get_primers stores the rounded melting temperature in
                # the primer's ``features`` attribute.
                amplicon.forward_primer.features,
                amplicon.reverse_primer.features,
            ]
            for number, amplicon in enumerate(self.unique_amplicons, start=1)
        ]
        return pd.DataFrame(rows, columns=columns)
def get_combinatorial_list(input_list):
    """
    Build the Cartesian product of a list of lists.

    Parameters
    ----------
    input_list : list of lists
        One inner list per position; an element is picked from each of them.

    Returns
    -------
    combinations : list of tuples
        Every way of picking one element from each inner list, in the order
        produced by ``itertools.product`` (the last position varies fastest).

    Example
    -------
    >>> get_combinatorial_list([[1, 2], ['a', 'b']])
    [(1, 'a'), (1, 'b'), (2, 'a'), (2, 'b')]
    """
    return [*itertools.product(*input_list)]
def get_systematic_names(parts_list: list) -> list:
    """Return the systematic (1-based index) name of every combination.

    Parameters
    ----------
    parts_list : list of list
        One inner list per position; the element type is irrelevant because
        only the length of each inner list is used.

    Returns
    -------
    systematic_names : list of tuples
        One tuple per combination, e.g. ``[(1, 1, 1), (1, 1, 2), ...]``,
        where every entry is the 1-based index of the part chosen at that
        position. The order matches ``itertools.product``.

    Example
    -------
    >>> get_systematic_names([['a', 'b'], ['c']])
    [(1, 1), (2, 1)]
    """
    # 1-based indices for the parts available at each position.
    index_ranges = [range(1, len(parts) + 1) for parts in parts_list]
    return list(itertools.product(*index_ranges))
def simple_amplicon_maker(
    list_of_seqs: list,
    list_of_names: list,
    target_tm=56.0,
    limit=13,
    primer_tm_func=_tm_default,
):
    """Design primers for every sequence and collect the resulting amplicons.

    Parameters
    ----------
    list_of_seqs : list[list[pydna.dseqrecord.Dseqrecord]]
        The sequences to turn into amplicons, grouped per position.
    list_of_names : list[list[str]]
        Name to give each amplicon, in the same nested layout as
        ``list_of_seqs`` (the amplicon returned by ``primer_design`` is
        renamed to this value).
    target_tm : float
        Forwarded to ``primer_design`` as ``target_tm``.
    limit : int
        Forwarded to ``primer_design`` as ``limit``.
    primer_tm_func : callable
        Forwarded to ``primer_design`` as ``tm_func`` and also called on the
        string form of each primer sequence to obtain the reported melting
        temperatures.

    Returns
    -------
    list_of_amplicons : list[list[pydna.amplicon.Amplicon]]
        The designed amplicons, grouped like ``list_of_seqs``.
    list_of_amplicon_primers : list[list[(pydna.seq.Seq, pydna.seq.Seq)]]
        ``(forward, reverse)`` primer sequences for every amplicon.
    list_of_amplicon_primer_temps : list[list[(float, float)]]
        ``(forward, reverse)`` melting temperatures for every amplicon.
    """
    list_of_amplicons = []
    list_of_amplicon_primers = []
    list_of_amplicon_primer_temps = []

    for group_index, seq_group in enumerate(list_of_seqs):
        group_amplicons = []
        group_primers = []
        group_temps = []

        for seq_index, template in enumerate(seq_group):
            amplicon = primer_design(
                template,
                tm_func=primer_tm_func,
                target_tm=target_tm,
                limit=limit,
            )
            # Restore the caller-supplied name on the new amplicon.
            amplicon.name = list_of_names[group_index][seq_index]
            group_amplicons.append(amplicon)

            forward_seq = amplicon.forward_primer.seq
            reverse_seq = amplicon.reverse_primer.seq
            group_primers.append((forward_seq, reverse_seq))

            forward_tm = primer_tm_func(str(forward_seq))
            reverse_tm = primer_tm_func(str(reverse_seq))
            group_temps.append((forward_tm, reverse_tm))

        list_of_amplicons.append(group_amplicons)
        list_of_amplicon_primers.append(group_primers)
        list_of_amplicon_primer_temps.append(group_temps)

    return list_of_amplicons, list_of_amplicon_primers, list_of_amplicon_primer_temps
def get_primers(
    assemblies: list,
    names: list,
    primer_temps: list,
):
    """Annotate and return ALL primers of the combinatorial library.

    Every amplicon is renamed to the matching entry of ``names``; its two
    primers get a description, the amplicon name as ``name`` and the rounded
    melting temperature stored in ``features``. The objects are modified in
    place.

    Parameters
    ----------
    assemblies : list[list[pydna.amplicon.Amplicon]]
        One list of amplicons per assembly.
    names : list[(str, ...)]
        ``names[i][j]`` is the name for amplicon ``j`` of assembly ``i``.
    primer_temps : list[((float, float), ...)]
        ``primer_temps[i][j]`` is the ``(forward, reverse)`` melting
        temperature for amplicon ``j`` of assembly ``i``.

    Returns
    -------
    primers : list[list[[pydna.primer.Primer, pydna.primer.Primer]]]
        ``[forward_primer, reverse_primer]`` for every amplicon, grouped per
        assembly.
    """
    all_primers = []
    for i, assembly in enumerate(assemblies):
        # Rename every amplicon first so that the "overlaps to" text below
        # always shows the final name of a neighbour, whichever side it is on.
        for j, amplicon in enumerate(assembly):
            amplicon.name = names[i][j]

        last_index = len(assembly) - 1
        assembly_primers = []
        for j, amplicon in enumerate(assembly):
            forward = amplicon.forward_primer
            reverse = amplicon.reverse_primer
            anneals = f"Anneals to {amplicon.name}"

            # The outer primers of an assembly have no neighbour to overlap
            # with; this also covers an assembly with a single amplicon.
            if j == 0:
                forward.description = anneals
            else:
                forward.description = (
                    f"{anneals}, overlaps to {assembly[j - 1].name}"
                )
            if j == last_index:
                reverse.description = anneals
            else:
                reverse.description = (
                    f"{anneals}, overlaps to {assembly[j + 1].name}"
                )

            # Template the primers anneal to.
            forward.name = amplicon.name
            reverse.name = amplicon.name

            # Melting temperatures, rounded to two decimals.
            forward_tm, reverse_tm = primer_temps[i][j][0], primer_temps[i][j][1]
            forward.features = round(float(forward_tm), 2)
            reverse.features = round(float(reverse_tm), 2)

            assembly_primers.append([forward, reverse])
        all_primers.append(assembly_primers)
    return all_primers
def assembly_maker(combinatorial_list_of_amplicons: list, overlap=35, maxlink=40):
    """Run ``assembly_fragments`` on every combination of fragments.

    Parameters
    ----------
    combinatorial_list_of_amplicons : list[[pydna.amplicon.Amplicon]]
        One list of fragments (amplicons, optionally with pads in between)
        per combination.
    overlap : int (default set to 35)
        Forwarded to ``assembly_fragments`` as the overlap argument
        (number of overlapping base pairs).
    maxlink : int (default set to 40)
        Forwarded to ``assembly_fragments`` as ``maxlink``; presumably the
        maximum length of a fragment that is treated as a linker/pad —
        confirm against the pydna documentation.

    Returns
    -------
    list_of_assemblies : list[[pydna.amplicon.Amplicon]]
        The result of ``assembly_fragments`` for every combination, in the
        same order as the input.
    """
    return [
        assembly_fragments(fragments, overlap, maxlink=maxlink)
        for fragments in combinatorial_list_of_amplicons
    ]
def unique_primers(primers: list, list_of_assemblies, price_per_base: float = 1.8):
    """Find the unique primers of a library and point the assemblies at them.

    Primers are considered identical when the string form of their sequence
    is identical. The first primer seen for a sequence is kept, gets a
    running id (``P001``, ``P002``, ...) and replaces every primer with the
    same sequence on the amplicons in ``list_of_assemblies`` (in place).

    Parameters
    ----------
    primers : list[list[list[pydna.primer.Primer]]]
        All primers made for the combinatorial library, grouped per assembly
        and per amplicon.
    list_of_assemblies : list[[pydna.amplicon.Amplicon]]
        Assemblies whose ``forward_primer`` / ``reverse_primer`` attributes
        are re-pointed at the unique primer objects.
    price_per_base : float (default set to 1.8)
        Price per nucleotide used for the price column.

    Returns
    -------
    primer_info : list[list]
        One row per unique primer:
        ``[id, name, seq, features, length, price, description, footprint,
        len(footprint)]``, where ``features`` holds the annealing
        temperature stored on the primer.
    """
    # Maps str(sequence) -> first primer seen with that sequence. A dict
    # lookup replaces the repeated list scans; this assumes equal sequences
    # have equal string forms (true for plain strings and expected for
    # pydna/Biopython sequences).
    primer_by_sequence = {}
    primer_info = []

    ### Numbering the unique primers and collecting their metrics
    for primer_group in primers:
        for primer_pair in primer_group:
            for primer in primer_pair:
                key = str(primer.seq)
                if key in primer_by_sequence:
                    continue
                primer_by_sequence[key] = primer
                primer.id = f"P{len(primer_by_sequence):03}"
                primer_info.append(
                    [
                        primer.id,
                        primer.name,
                        primer.seq,
                        primer.features,  # annealing temp
                        len(primer.seq),  # length
                        len(primer.seq) * price_per_base,  # price
                        primer.description,  # description
                        primer.footprint,
                        len(primer.footprint),
                    ]
                )

    ### Re-pointing duplicates at the unique primer objects
    # Forward and reverse primers are handled independently, so an amplicon
    # whose two primers share a sequence gets both of them replaced.
    for assembly in list_of_assemblies:
        for amplicon in assembly:
            forward = primer_by_sequence.get(str(amplicon.forward_primer.seq))
            if forward is not None:
                amplicon.forward_primer = forward
            reverse = primer_by_sequence.get(str(amplicon.reverse_primer.seq))
            if reverse is not None:
                amplicon.reverse_primer = reverse
    return primer_info
def unique_amplicons(list_of_assemblies: list):
    """Collect every distinct amplicon that occurs in the assemblies.

    Parameters
    ----------
    list_of_assemblies : list[[pydna.amplicon.Amplicon]]
        The assemblies of the combinatorial library.

    Returns
    -------
    unique_amplicons : list[pydna.amplicon.Amplicon]
        The amplicons in order of first appearance; an amplicon that compares
        equal (``==``) to one already collected is skipped.
    """
    distinct = []
    for amplicon in itertools.chain.from_iterable(list_of_assemblies):
        if amplicon in distinct:
            continue
        distinct.append(amplicon)
    return distinct
def count_unique_parts(predictions_df, max_combinations):
    """
    Walk through the predictions and collect newly encountered parts.

    Rows are visited from the top (positionally) until the number of
    combinations that can be built from the collected parts reaches
    ``max_combinations`` or the DataFrame is exhausted. Rows after the
    stopping point do not contribute any parts.

    Parameters
    ----------
    predictions_df : pd.DataFrame
        DataFrame containing predictions; must provide the columns
        'G8H', 'pG8H', 'CPR' and 'pCPR'.
    max_combinations : int
        Stop as soon as the number of combinations is at least this value.

    Returns
    -------
    encountered_parts : dict
        'G8H', 'pG8H', 'pCPR', 'CPR' : lists with the unique parts in order
        of first appearance.
        'sum_of_parts' : str, the product of the four list lengths.
        'prediction_number' : str, the number of rows that were consumed.
        Both strings stay empty (``""``) when ``max_combinations`` is not
        positive, because no row is examined in that case.
    """
    part_columns = ("G8H", "pG8H", "CPR", "pCPR")
    encountered_parts = {
        "G8H": [],
        "pG8H": [],
        "pCPR": [],
        "CPR": [],
        "sum_of_parts": "",
        "prediction_number": "",
    }

    # Nothing is requested: leave the counters as empty strings.
    if not 0 < max_combinations:
        return encountered_parts

    # Fetch the columns once instead of on every iteration.
    columns = {name: predictions_df[name] for name in part_columns}
    total_rows = len(predictions_df)
    rows_consumed = 0

    while True:
        sum_of_parts = (
            len(encountered_parts["G8H"])
            * len(encountered_parts["CPR"])
            * len(encountered_parts["pG8H"])
            * len(encountered_parts["pCPR"])
        )
        encountered_parts["sum_of_parts"] = str(sum_of_parts)
        encountered_parts["prediction_number"] = str(rows_consumed)

        # Stop BEFORE reading another row: either the target is reached (so
        # the part lists match the reported sum) or there is no row left.
        if not sum_of_parts < max_combinations or rows_consumed >= total_rows:
            break

        for name in part_columns:
            part = columns[name].iloc[rows_consumed]
            if part not in encountered_parts[name]:
                encountered_parts[name].append(part)
        rows_consumed += 1

    return encountered_parts