Source code for teemi.design.retrieve_gene_homologs

#!/usr/bin/env python
# MIT License
# Copyright (c) 2024, Technical University of Denmark (DTU)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

""" This part of the design module is used fetching gene homologs"""


import re
from Bio import pairwise2
import pandas as pd
from dnachisel import DnaOptimizationProblem, EnforceGCContent, CodonOptimize
from typing import List
from Bio.SeqRecord import SeqRecord


[docs]def alignment_identity(query: list, reference: str) -> list: """Calculates percent identity between a reference and query(s). Parameters ---------- query : list list of Biopython Seqrecord objects reference : str Returns ------- list of percent identeties as floats """ alignment_score = [] for alignment in query: alignment_score.append( pairwise2.align.globalmx( reference.seq, alignment.seq, 1, 0, score_only=True ) / len(reference.seq) ) return alignment_score
[docs]def filter_blast_results( blast_record, E_VALUE_THRESH=0.4, LOWER_PROTEIN_IDENTITY_THRESH=0.1, UPPER__PROTEIN_IDENTITY_THRESH=1, show_alignment=False, ): Alignments_that_follow_our_criteria = [] # These are the ACC numbers # saving some of the metrics Names = [] Identity = [] E_value = [] Length = [] ACC_number = [] counter = 0 for alignment in blast_record.alignments: for hsp in alignment.hsps: LENGTH = alignment.length # IDENTITY = hsp.identities / hsp.align_length # not super accurate # We add the added reqirement that the sequence should be within the AA identeties if ( hsp.expect < E_VALUE_THRESH and IDENTITY > LOWER_PROTEIN_IDENTITY_THRESH and IDENTITY < UPPER__PROTEIN_IDENTITY_THRESH ): counter += 1 if show_alignment: print("\nAlignment#", counter) print("Name:", alignment.hit_def) print("Title:", alignment.title) print("Length:", alignment.length) print("E value:", hsp.expect) print("Query:", hsp.query[0:75] + "...") print("Match:", hsp.match[0:75] + "...") print("Subjt:", hsp.sbjct[0:75] + "...") print("Identitiy", "{:.2f}".format(IDENTITY)) print(alignment) # Saving the metrics we want into different lists Alignments_that_follow_our_criteria.append(alignment) Names.append(alignment.hit_def) Identity.append(IDENTITY) E_value.append(hsp.expect) Length.append(alignment.length) ACC_number.append(alignment.accession) if show_alignment: print("\nTOTAL HOMOLOGS", counter) data = list(zip(Names, Identity, E_value, Length, ACC_number)) df_alignments_that_follow_our_criteria = pd.DataFrame( data, columns=["Name", "Identity", "E_value", "Length", "ACC_number"] ) return df_alignments_that_follow_our_criteria
[docs]def codon_optimize_with_dnachisel( sequences: List[SeqRecord], lower_GC: float = 0.3, upper_GC: float = 0.7, species: str = None, codon_usage_table=None, window: int = 50, ) -> List[SeqRecord]: """Codon-optimize sequences with_dnachisel. Parameters ---------- sequences : list list of Bio.SeqRecord objects lower_GC : float the lowest GC content in the region of 50 bp upper_GC : float the highest GC content in the region of 50 bp species : str name of the species for which to optimize the sequence. examples: 'e_coli, s_cerevisiae, h_sapiens, c_elegans, b_subtilis, d_melanogaster check python_codon_tables for more info. codon_usage_table: a codon table following the structure of: {'*': {'TAA': 0.0, 'TAG': 0.0, 'TGA': 1.0},... Returns ------- list of codon optimized sequences for yeast """ if not species and not codon_usage_table: raise ValueError( "At least one of `species` and `codon_usage_table` must be specified." ) codon_optimized_seqs = [] # DEFINE THE OPTIMIZATION PROBLEM for seq in sequences: if species: problem = DnaOptimizationProblem( sequence=seq.seq, constraints=[ EnforceGCContent(mini=lower_GC, maxi=upper_GC, window=window) ], objectives=[CodonOptimize(species=species)], ) # SOLVE THE CONSTRAINTS, OPTIMIZE WITH RESPECT TO THE OBJECTIVE problem.resolve_constraints() problem.optimize() print(problem.constraints_text_summary()) print(problem.objectives_text_summary()) elif codon_usage_table: problem = DnaOptimizationProblem( sequence=seq.seq, constraints=[ EnforceGCContent(mini=lower_GC, maxi=upper_GC, window=window) ], objectives=[CodonOptimize(codon_usage_table=codon_usage_table)], ) # SOLVE THE CONSTRAINTS, OPTIMIZE WITH RESPECT TO THE OBJECTIVE problem.resolve_constraints() problem.optimize() print(problem.constraints_text_summary()) print(problem.objectives_text_summary()) # GET THE FINAL SEQUENCE AS ANNOTATED BIOPYTHON RECORDS) final_record = problem.to_record(with_sequence_edits=True) final_record.id = seq.id final_record.name = seq.name final_record.description = seq.description codon_optimized_seqs.append(final_record) return codon_optimized_seqs
[docs]def find_all_starts(seq): """Find the starting index of all start codons in a lowercase seq This function was made by Justin Bois : http://justinbois.github.io/. """ # Initialize array of indices of start codons starts = [] # Find index of first start codon (remember, find() returns -1 if not found) i = seq.find("atg") # Keep looking for subsequence incrementing starting point of search while i >= 0: starts.append(i) i = seq.find("atg", i + 1) return tuple(starts)
[docs]def find_first_in_register_stop(seq): """ Find first stop codon on lowercase seq that starts at an index that is divisible by three. This function was made by Justin Bois : http://justinbois.github.io/. """ # Compile regexes for stop codons regex_stop = re.compile("(taa|tag|tga)") # Stop codon iterator stop_iterator = regex_stop.finditer(seq) # Find next stop codon that is in register for stop in stop_iterator: if stop.end() % 3 == 0: return stop.end() # Return -1 if we failed to find a stop codon return -1
[docs]def all_orfs(seq): """Return all ORFs of a sequence. This function was made by Justin Bois : http://justinbois.github.io/.""" # Make sure sequence is all lower case seq = seq.lower() # Find the indices of all start codons start_inds = find_all_starts(seq) # Keep track of stops stop_inds = [] # Initialze ORFs. Each entry in list is [ORF length, ORF start, ORF stop] orfs = [] # For each start codon, find the next stop codon in register for start in start_inds: relative_stop = find_first_in_register_stop(seq[start:]) if relative_stop != -1: # Index of stop codon stop = start + relative_stop # If already had stop, a longer ORF contains this one if stop not in stop_inds: orfs.append((relative_stop, start, stop)) stop_inds.append(stop) # Get sorted list of ORF length orfs = sorted(orfs, reverse=True) # Remove lengths for i, orf in enumerate(orfs): orfs[i] = (orf[1], orf[2]) return tuple(orfs)
[docs]def longest_orf(seq, n=1): """Longest ORF of a sequence. This function was made by Justin Bois : http://justinbois.github.io/.""" orfs = all_orfs(seq) if len(orfs) == 0: return "" elif n == 1 or len(orfs) == 1: return seq[orfs[0][0] : orfs[0][1]] else: return_list = [] for i in range(min(n, len(orfs))): return_list.append(seq[orfs[i][0] : orfs[i][1]]) return tuple(return_list)