Source code for teemi.design.combinatorial_design

#!/usr/bin/env python
# MIT License
# Copyright (c) 2024, Technical University of Denmark (DTU)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

""" This part of the design module is used for making combinatorial libraries from DNA fragments."""

# standard libraries
import itertools
import numpy as np
import pandas as pd

# Pydna for the molecular bio
from pydna.design import primer_design
from pydna.design import assembly_fragments
from pydna.assembly import Assembly
from pydna.tm import tm_default as _tm_default
from pydna.dseqrecord import Dseqrecord

# Typing
from typing import List, Callable


class DesignAssembly:
    """Make a combinatorial library from DNA fragments.

    Parameters
    ----------
    list_of_seqs : List[List[Dseqrecord]]
        One inner list per position in the construct; each inner list holds
        the alternative sequences for that position.
    list_of_pads : List[Dseqrecord]
        Nucleotide sequences to be incorporated into the primers
        (Max is 40 bp).
    positions_of_pads : List[int]
        The position in the list of seqs where each pad is incorporated
        (zero indexed). Paired element-wise with ``list_of_pads``.
    target_tm : float
        Forwarded to ``simple_amplicon_maker``.
    limit : int
        Forwarded to ``simple_amplicon_maker``.
    overlap : int
        Forwarded to ``assembly_maker``.
    tm_func : Callable
        Melting temperature function forwarded to ``simple_amplicon_maker``.

    Returns
    -------
    DesignAssembly object
        A lot of information can be retrieved from the object, such as all
        the amplicons and primers needed to construct the combinatorial
        library via ``pcr_list_to_dataframe`` or ``primer_list_to_dataframe``.
    """

    def __init__(
        self,
        list_of_seqs: List[List[Dseqrecord]],
        list_of_pads: List[Dseqrecord],
        positions_of_pads: List[int],
        target_tm=55.0,
        limit=13,
        overlap=35,
        tm_func: Callable = _tm_default,
    ):
        ### 1. INITIALIZING ##
        self.list_of_seqs = list_of_seqs
        self.list_of_pads = list_of_pads
        self.positions_of_pads = positions_of_pads

        # Generate list of names from Dseqrecord objects
        self.list_of_names = [
            [seq.name for seq in seq_list] for seq_list in list_of_seqs
        ]

        ### 2. Amplicons, primers, and their temperatures
        (
            self.list_of_amplicons,
            self.list_of_amplicon_primers,
            self.list_of_amplicon_primer_temps,
        ) = simple_amplicon_maker(
            self.list_of_seqs,
            self.list_of_names,
            target_tm=target_tm,
            limit=limit,
            primer_tm_func=tm_func,
        )

        # Systematic names
        self.systematic_names = get_systematic_names(self.list_of_seqs)

        ### 3. COMBINATORIAL LISTS
        # The combinations come back as tuples; they are turned into lists
        # so the pads can be inserted below.
        self.combinatorial_list_of_amplicons = [
            list(combination)
            for combination in get_combinatorial_list(self.list_of_amplicons)
        ]
        self.combinatorial_list_of_names = get_combinatorial_list(self.list_of_names)
        self.combinatorial_list_of_primer_tm = get_combinatorial_list(
            self.list_of_amplicon_primer_temps
        )

        #### 4. Adding PADS ###
        # Insert from the highest position down so that an insertion never
        # shifts a position that still has to be processed.
        # NOTE: zip() stops at the shorter list, so surplus pads/positions
        # are ignored.
        pads_by_position = sorted(
            zip(self.list_of_pads, self.positions_of_pads),
            key=lambda x: x[1],
            reverse=True,
        )
        for fragments in self.combinatorial_list_of_amplicons:
            for pad, pos in pads_by_position:
                fragments.insert(pos, pad)

        ### 5. Assembling and making overlapping primers
        self.list_of_assemblies = assembly_maker(
            self.combinatorial_list_of_amplicons, overlap=overlap
        )

        ### 6. GETTING all primers, annotating, adding features
        self.primers = get_primers(
            self.list_of_assemblies,
            self.combinatorial_list_of_names,
            self.combinatorial_list_of_primer_tm,
        )

        ### 7. Getting Unique primers and re-annotating list_assemblies to get right names
        self.unique_primers = unique_primers(self.primers, self.list_of_assemblies)

        ### 8. Unique amplicons
        self.unique_amplicons = unique_amplicons(self.list_of_assemblies)

    def show_contigs(self):
        """Print the template names of every contig generated by the assembly.

        Output goes to stdout; the method itself returns None.
        """
        for i, assembly in enumerate(self.list_of_assemblies):
            print("\nContig" + str(self.systematic_names[i]))
            for amplicon in assembly:
                # Only the first 15 characters of the name are shown.
                print("Template: ", amplicon.name[0:15])
        return

    def show_variants_lib_df(self):
        """Return a dataframe of all the variants.

        One row per combination of part names, plus the columns
        ``Systematic_name`` and ``Variant`` (a running number from 0).
        """
        combinatorial_lib_variants_df = pd.DataFrame(self.combinatorial_list_of_names)
        combinatorial_lib_variants_df["Systematic_name"] = self.systematic_names
        combinatorial_lib_variants_df["Variant"] = np.arange(
            len(combinatorial_lib_variants_df)
        )
        return combinatorial_lib_variants_df

    def primer_list(self):
        """Return the list of unique primers (a new list on every call)."""
        return list(self.unique_primers)

    def primer_list_to_dataframe(self):
        """Return a pandas dataframe with the list of unique primers."""
        # Passing the column names to the constructor (instead of assigning
        # ``df.columns`` afterwards) also works when there are no primers.
        return pd.DataFrame(
            self.unique_primers,
            columns=[
                "id",
                "anneals to",
                "sequence",
                "annealing temperature",
                "length",
                "price(DKK)",
                "description",
                "footprint",
                "len_footprint",
            ],
        )

    def pcr_list_to_dataframe(self):
        """Return a pandas dataframe with one row per unique PCR (amplicon)."""
        rows = [
            [
                "PCR{number}".format(number=number),
                amplicon.name,
                amplicon.forward_primer.id,
                amplicon.reverse_primer.id,
                # ``features`` holds the melting temperature written by
                # ``get_primers`` in ``__init__``.
                amplicon.forward_primer.features,
                amplicon.reverse_primer.features,
            ]
            for number, amplicon in enumerate(self.unique_amplicons, start=1)
        ]
        # Column names go to the constructor so an empty list still yields a
        # (column-labelled) empty dataframe instead of raising.
        return pd.DataFrame(
            rows,
            columns=[
                "pcr_number",
                "template",
                "forward_primer",
                "reverse_primer",
                "f_tm",
                "r_tm",
            ],
        )
def get_combinatorial_list(input_list):
    """
    Build the Cartesian product of a list of lists.

    Parameters
    ----------
    input_list : list of lists
        Each inner list holds the options for one position.

    Returns
    -------
    combinations : list of tuples
        Every way of picking one element from each inner list, in the order
        produced by ``itertools.product``.

    Example
    -------
    >>> get_combinatorial_list([[1, 2], ['a', 'b']])
    [(1, 'a'), (1, 'b'), (2, 'a'), (2, 'b')]
    """
    return [*itertools.product(*input_list)]
def get_systematic_names(parts_list: list) -> list:
    """Return systematic (1-based index) names for every combination of parts.

    Parameters
    ----------
    parts_list : list of list
        One inner list per position; the element type does not matter, only
        the length of each inner list is used.

    Returns
    -------
    systematic_names : list of tuples
        All combinations of 1-based part indexes, e.g. [(1, 1, 1), (1, 2, 1)].
    """
    # One range of 1..n per position, where n is the number of parts there.
    index_ranges = [range(1, len(options) + 1) for options in parts_list]
    return list(itertools.product(*index_ranges))
def simple_amplicon_maker(
    list_of_seqs: list,
    list_of_names: list,
    target_tm=56.0,
    limit=13,
    primer_tm_func=_tm_default,
):
    """Design an amplicon for every sequence and give it the supplied name.

    Parameters
    ----------
    list_of_seqs : list[list[pydna.dseqrecord.Dseqrecord]]
        The sequences to be made into amplicons, grouped per position.
    list_of_names : list[list[str]]
        Names for the sequences, indexed exactly like ``list_of_seqs``; each
        designed amplicon gets the name at its own [i][j] index.
    target_tm : float
        Passed to ``primer_design`` as ``target_tm``.
    limit : int
        Passed to ``primer_design`` as ``limit``.
    primer_tm_func : callable
        Passed to ``primer_design`` as ``tm_func`` and also called on the
        string form of each primer sequence to get its melting temperature.

    Returns
    -------
    list_of_amplicons : list[list[pydna.amplicon.Amplicon]]
        The designed amplicons, grouped like ``list_of_seqs``.
    list_of_amplicon_primers : list[list[(pydna.seq.Seq, pydna.seq.Seq)]]
        Per amplicon a tuple (forward primer sequence, reverse primer sequence).
    list_of_amplicon_primer_temps : list[list[(float, float)]]
        Per amplicon a tuple (forward melting temp, reverse melting temp).
    """
    list_of_amplicons = []
    list_of_amplicon_primers = []
    list_of_amplicon_primer_temps = []

    for group_index, seq_group in enumerate(list_of_seqs):
        group_amplicons = []
        group_primers = []
        group_temps = []

        for seq_index, template in enumerate(seq_group):
            amplicon = primer_design(
                template,
                tm_func=primer_tm_func,
                target_tm=target_tm,
                limit=limit,
            )
            # Give the amplicon the caller-supplied name.
            amplicon.name = list_of_names[group_index][seq_index]
            group_amplicons.append(amplicon)

            forward_seq = amplicon.forward_primer.seq
            reverse_seq = amplicon.reverse_primer.seq
            group_primers.append((forward_seq, reverse_seq))

            forward_tm = primer_tm_func(str(forward_seq))
            reverse_tm = primer_tm_func(str(reverse_seq))
            group_temps.append((forward_tm, reverse_tm))

        list_of_amplicons.append(group_amplicons)
        list_of_amplicon_primers.append(group_primers)
        list_of_amplicon_primer_temps.append(group_temps)

    return list_of_amplicons, list_of_amplicon_primers, list_of_amplicon_primer_temps
def get_primers(
    assemblies: list,
    names: list,
    primer_temps: list,
):
    """Return ALL primers of the combinatorial library, annotated in place.

    For every amplicon the function sets ``amplicon.name`` and, on both of
    its primers, ``description`` (what it anneals/overlaps to), ``name`` (the
    template name) and ``features`` (the melting temperature rounded to two
    decimals).

    Parameters
    ----------
    assemblies : list[list[pydna.amplicon.Amplicon]]
        One list of amplicons per assembly.
    names : list[(str)]
        ``names[i][j]`` is the name given to amplicon ``j`` of assembly ``i``.
    primer_temps : list[(float, float),..)...]
        ``primer_temps[i][j]`` is (forward tm, reverse tm) for that amplicon.

    Returns
    -------
    primers : list[list[[pydna.primer.Primer, pydna.primer.Primer]]]
        ``[forward_primer, reverse_primer]`` for every amplicon of every
        assembly.
    """
    all_primers = []
    for i, assembly in enumerate(assemblies):
        # Rename every amplicon of the assembly first, so the "overlaps to"
        # text below always shows the final name of the neighbour (also for
        # the neighbour that comes later in the assembly).
        for j, amplicon in enumerate(assembly):
            amplicon.name = names[i][j]

        last_index = len(assembly) - 1
        assembly_primers = []
        for j, amplicon in enumerate(assembly):
            forward = amplicon.forward_primer
            reverse = amplicon.reverse_primer
            anneals_to = f"Anneals to {amplicon.name}"

            # The outer ends of an assembly have no neighbour to overlap
            # with. Checking both ends independently also covers an assembly
            # with a single amplicon, which has no neighbour on either side.
            if j == 0:
                forward.description = anneals_to
            else:
                forward.description = (
                    f"{anneals_to}, overlaps to {assembly[j - 1].name}"
                )
            if j == last_index:
                reverse.description = anneals_to
            else:
                reverse.description = (
                    f"{anneals_to}, overlaps to {assembly[j + 1].name}"
                )

            # template it anneals to
            forward.name = amplicon.name
            reverse.name = amplicon.name

            # Primer tm (stored in the ``features`` attribute)
            forward_tm, reverse_tm = primer_temps[i][j][0], primer_temps[i][j][1]
            forward.features = round(float(forward_tm), 2)
            reverse.features = round(float(reverse_tm), 2)

            assembly_primers.append([forward, reverse])
        all_primers.append(assembly_primers)
    return all_primers
def assembly_maker(combinatorial_list_of_amplicons: list, overlap=35):
    """Run ``assembly_fragments`` on every combination of fragments.

    Parameters
    ----------
    combinatorial_list_of_amplicons : list[[pydna.amplicon.Amplicon]]
        One list of fragments (amplicons, possibly with pads in between) per
        combination.
    overlap : int (default set to 35)
        Passed to ``assembly_fragments`` as the overlap argument.

    Returns
    -------
    list_of_assemblies : list[[pydna.amplicon.Amplicon]]
        The result of ``assembly_fragments`` for every combination, in the
        same order as the input. ``maxlink`` is fixed at 40.
    """
    return [
        assembly_fragments(fragments, overlap, maxlink=40)
        for fragments in combinatorial_list_of_amplicons
    ]
def unique_primers(primers: list, list_of_assemblies):
    """Find the unique primers of a combinatorial library and number them.

    Primers are considered identical when the string form of their sequence
    is identical. The first primer seen with a given sequence becomes the
    representative: it gets an id ``P001``, ``P002``, ... and every amplicon
    in ``list_of_assemblies`` that uses a primer with the same sequence is
    re-pointed to that representative object.

    Parameters
    ----------
    primers : list[list[list[pydna.primer.Primer]]]
        All the primers made for the combinatorial library
        (assembly -> amplicon -> [forward, reverse]).
    list_of_assemblies : list[[pydna.amplicon.Amplicon]]
        Amplicons whose ``forward_primer`` / ``reverse_primer`` are replaced
        in place by the representative primers.

    Returns
    -------
    unique_primers : list[list(ID,Anneals_to,Sequence,Annealing_temp,Length,Price(DKK),Description,Footprint,Len_footprint)]
        One row of metrics per unique primer, in order of first appearance.
    """
    # Sequence text -> first primer object seen with that sequence.
    # A dict gives constant-time lookups instead of scanning a list.
    representative = {}
    primer_info = []

    ### CHANGING THE NAMES OF THE PRIMERS
    for primer_group in primers:
        for primer_pair in primer_group:
            for primer in primer_pair:
                key = str(primer.seq)
                if key in representative:
                    continue
                representative[key] = primer
                primer.id = f"P{len(representative):03}"
                primer_info.append(
                    [
                        primer.id,
                        primer.name,
                        primer.seq,
                        primer.features,  # annealing temp
                        len(primer.seq),  # length
                        len(primer.seq) * 1.8,  # price
                        primer.description,  # description
                        primer.footprint,
                        len(primer.footprint),
                    ]
                )

    ### Updating primer names and removing duplicates
    for assembly in list_of_assemblies:
        for amplicon in assembly:
            # Forward and reverse are handled independently, so an amplicon
            # whose two primers share a sequence gets both of them replaced.
            forward = representative.get(str(amplicon.forward_primer.seq))
            if forward is not None:
                amplicon.forward_primer = forward
            reverse = representative.get(str(amplicon.reverse_primer.seq))
            if reverse is not None:
                amplicon.reverse_primer = reverse

    return primer_info
def unique_amplicons(list_of_assemblies: list):
    """Collect the distinct amplicons found in a list of assemblies.

    Parameters
    ----------
    list_of_assemblies : list[[pydna.amplicon.Amplicon]]
        The combinatorial library: one list of amplicons per assembly.

    Returns
    -------
    unique_amplicons : list[pydna.amplicon.Amplicon]
        Every amplicon that is not equal (``==``) to one collected earlier,
        in order of first appearance.
    """
    collected = []
    for assembly in list_of_assemblies:
        for amplicon in assembly:
            if amplicon in collected:
                continue
            collected.append(amplicon)
    return collected
def get_assembly_figure(assembly_list, limit=15):
    """
    Build the figure of the first linear contig of an assembly.

    Parameters
    ----------
    assembly_list : list
        The fragments handed to ``Assembly``.
    limit : int, optional
        Passed to ``Assembly`` as ``limit``, by default 15.

    Returns
    -------
    contig
        Whatever ``figure()`` returns for the first element of
        ``assemble_linear()``.
    """
    linear_contigs = Assembly(assembly_list, limit=limit).assemble_linear()
    first_contig = linear_contigs[0]
    return first_contig.figure()
def count_unique_parts(predictions_df, max_combinations):
    """
    Walk through the predictions and collect newly encountered parts.

    Rows are read one at a time (looked up as ``column[0]``, ``column[1]``,
    ...) until the product of the number of unique parts per column reaches
    ``max_combinations`` or there are no more rows to read.

    Parameters
    ----------
    predictions_df : pd.DataFrame
        DataFrame containing predictions, with the columns 'G8H', 'pG8H',
        'CPR' and 'pCPR'.
    max_combinations : int
        The number of combinations at which collecting stops.

    Returns
    -------
    encountered_parts : dict
        'G8H', 'pG8H', 'pCPR', 'CPR' hold the lists of unique parts collected.
        'sum_of_parts' is the product of the lengths of those four lists and
        'prediction_number' is the number of rows that were read, both as
        strings. Both stay '' when ``max_combinations`` is not positive,
        because nothing is read in that case.
    """
    part_columns = ("G8H", "pG8H", "pCPR", "CPR")

    # Initialization
    encountered_parts = {column: [] for column in part_columns}
    encountered_parts["sum_of_parts"] = ""
    encountered_parts["prediction_number"] = ""

    n_predictions = len(predictions_df)
    prediction_index = 0
    sum_of_parts = 0

    # Loop through the predictions and save new parts.
    while sum_of_parts < max_combinations:
        sum_of_parts = 1
        for column in part_columns:
            sum_of_parts *= len(encountered_parts[column])
        encountered_parts["sum_of_parts"] = str(sum_of_parts)
        encountered_parts["prediction_number"] = str(prediction_index)

        # Stop before reading another row once the target is reached (so the
        # collected parts always match the reported sum) or when there is no
        # row left (so the lookup below never runs past the end).
        if sum_of_parts >= max_combinations or prediction_index >= n_predictions:
            break

        for column in part_columns:
            part = predictions_df[column][prediction_index]
            if part not in encountered_parts[column]:
                encountered_parts[column].append(part)

        prediction_index += 1

    return encountered_parts