Source code for teemi.design.fetch_sequences

#!/usr/bin/env python
# MIT License
# Copyright (c) 2024, Technical University of Denmark (DTU)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

""" This part of the design module is used fetching sequences"""

from Bio import SeqIO
from Bio import Entrez
import requests as r
from io import StringIO

# intermine
# from __future__ import print_function
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


[docs]def retrieve_sequences_from_ncbi( list_of_acc_numbers: list, out_file: str, db="protein" ): """Retrieves sequences from ncbi. Parameters ---------- list_of_acc_numbers: list list_of_acc_numbers such as: ['Q05001', 'Q1PQK4','Q9SB48' ,'AFX82679'] Returns ------- A fasta file with your sequences """ try: email = "youremail@gmail.com" out_handle = open(out_file, "w") for i in range(0, len(list_of_acc_numbers)): Entrez.email = email handle = Entrez.efetch( db=db, id=list_of_acc_numbers[i], rettype="fasta", retmode="text" ) out_handle.write(handle.read()) out_handle.close() except: print( "An exception occurred, please double-check your accession numbers or connection" )
[docs]def read_fasta_files(path): """Reads FASTA files. Parameters ---------- path: str path to the fasta file you want to read. Returns ------- list of Bio.SeqRecord.SeqRecord """ ncbi_hits = [] for seq_record in SeqIO.parse(path, format="fasta"): ncbi_hits.append(seq_record) return ncbi_hits
[docs]def read_genbank_files(path): """Reads single Genbank files. Parameters ---------- path: str path to the genbank file you want to read. Returns ------- list of Bio.SeqRecord.SeqRecord """ ncbi_hits = [] for seq_record in SeqIO.parse(path, format="gb"): ncbi_hits.append(seq_record) return ncbi_hits
[docs]def retrieve_sequences_from_PDB(query: list): """Retrieves sequences from PDB. Parameters ---------- query: list list of accession numbers in the form of strings Returns ------- list of Bio.SeqRecord.SeqRecord """ list_of_protein_seqs = [] for q in query: cID = q baseUrl = "http://www.uniprot.org/uniprot/" currentUrl = baseUrl + cID + ".fasta" response = r.post(currentUrl) cData = "".join(response.text) Seq = StringIO(cData) Protein_sequence = list(SeqIO.parse(Seq, "fasta")) list_of_protein_seqs.append(Protein_sequence) return list_of_protein_seqs
[docs]def fetch_promoter(promoter_name: str): from intermine.webservice import Service """Retrieves a yeast promoter sequence from intermine. Parameters ---------- promoter_name: str Returns ------- promoter sequence : str """ seq = "" service = Service("https://yeastmine.yeastgenome.org/yeastmine/service") query = service.new_query("Gene") query.add_view( "secondaryIdentifier", "symbol", "length", "flankingRegions.direction", "flankingRegions.sequence.length", "flankingRegions.sequence.residues", ) query.add_constraint("Gene", "LOOKUP", promoter_name, "S. cerevisiae", code="B") query.add_constraint("flankingRegions.direction", "=", "upstream", code="C") query.add_constraint("flankingRegions.distance", "=", "1.0kb", code="A") query.add_constraint("flankingRegions.includeGene", "=", "false", code="D") for row in query.rows(): seq = row["flankingRegions.sequence.residues"] return seq
[docs]def fetch_multiple_promoters(List_of_promoter_names: list): """Retrieves a yeast promoter sequence from intermine. Parameters ---------- List_of_promoter_names: list list of strings of promoter names fx : ['YAR035C-A', 'YGR067C', 'JEN1', 'YNR034W-A', 'ACH1'] Returns ------- list of Bio.SeqRecord.SeqRecord """ # #initializing LIST_OF_BIOrecord_objects = [] for i in range(0, len(List_of_promoter_names)): # fetching the seqs promoters_seq = SeqRecord(Seq(fetch_promoter(List_of_promoter_names[i]))) promoters_seq.name = str(List_of_promoter_names[i]) + " Promoter" promoters_seq.id = str(List_of_promoter_names[i]) promoters_seq.description = "Defined as being 1kb upstream of the TSS and fetched through Intermines API" # Append to list LIST_OF_BIOrecord_objects.append(promoters_seq) return LIST_OF_BIOrecord_objects