A comprehensive package of biological constants, serving as a foundational resource for biology and bioinformatics, complemented by functions to streamline related tasks.
Project description
Biobase
A Python package providing standardized biological constants and substitution matrices for bioinformatics pipelines. Biobase aims to eliminate the need to repeatedly recreate common biological data structures and scoring systems in your code.
Table of Contents
Quick Start
Access amino acid properties
from biobase.constants import ONE_LETTER_CODES, MONO_MASS, CODON_TABLE
print(ONE_LETTER_CODES) # 'ACDEFGHIKLMNPQRSTVWY'
print(MONO_MASS['A']) # 71.037113805
print(CODON_TABLE["AUG"]) # M
Use substitution matrices
from biobase.matrix import Blosum
blosum62 = Blosum(62)
print(blosum62['A']['A']) # 4
print(blosum62['W']['C']) # -2
from biobase.matrix import Pam
pam200 = Pam(200)
print(pam200['A']['A']) # 3
print(pam200['W']['C']) # -9
from biobase.matrix import Identity
identity0 = Identity(0)
print(identity0['A']['A']) # 1
print(identity0['W']['C']) # 0
from biobase.matrix import Match
match_mat = Match()
print(match_mat['A']['A']) # 1
print(match_mat['W']['C']) # -1
Analyse DNA sequences
from biobase.analysis import Dna
sequence = "ATCGTAGC"
print(Dna.complement(sequence)) # 'TAGCATCG'
print(Dna.complement(sequence, reverse=True)) # 'GCTACGAT'
print(Dna.transcribe(sequence)) # 'AUCGUAGC'
print(Dna.translate(sequence)) # 'IV'
print(Dna.calculate_gc_content(sequence)) # 50.0
print(Dna.calculate_at_content(sequence)) # 50.0
print(Dna.entropy(sequence)) # 2.0
seq = "ccatgccctaaatggggtag"
for start, end, orf in Dna.find_orfs(seq, include_seq=True):
print(start, end, orf)
# 2, 11, "ATGCCCTAA"
# 11, 20, "ATGGGGTAG"
Analyse Nucleotides
from biobase.analysis import Nucleotides
print(Nucleotides.molecular_weight("A")) # 135.13
print(Nucleotides.cumulative_molecular_weight("ATCG")) # 523.48
print(Nucleotides.translate("AUGUUGUCGCCUU")) # 'MLSP'
Find protein motifs
from biobase.analysis import find_motifs
sequence = "ACDEFGHIKLMNPQRSTVWY"
print(find_motifs(sequence, "DEF"))
# [(2, 5)]
test_dict = {
">SP001": "ACDEFCDEFCDEFGHIKLMN", # has matches for "CDE" that span indexes [(1, 4), (5, 8), (9, 12)]
">SP002": "MNPQRSTVWYACDEFGHIKL", # has match for "CDE" that span indexes [(11, 14)]
">SP003": "AAAAAAAAAAAAAAAAAA12", # invalid: contains "1", "2"
">SP004": "GGGGGGGGGGGGGGGGGGGG", # no match
">SP005": "HHHHHHHHHHHHHHHHH@#$", # invalid: contains "@", "#", "$"
">SP006": "DDDDDDDDDDDDDDDDDDDD", # no match
">SP007": "CDEFGHCDEFKLCDEFPQRS", # has matches for "CDE" that span indexes [(0, 3), (6, 9), (12, 15)]
">SP008": "LLLLLLLLLLLLLLLLLLLL", # no match
">SP009": "KKKKKKKKKKKK123KKKKK", # invalid: contains "1", "2", "3"
">SP010": "CDEACDEDCDEFAAAAAAAA", # has matches for "CDE" that span indexes [(0, 3), (4, 7), (8, 11)]
}
matched, invalid, non_match = find_motifs(test_dict, "CDE")
print("Matches:")
for seq, matches in matched.items():
print(f"{seq}")
print(f"{"".join([f"{match[0]} to {match[1]}\n" for match in matches])}")
print(f"Invalid sequences:\n{"".join([f"{seq}: {invs}\n" for seq, invs in invalid.items()])}")
print(f"Sequences without matches:\n{"".join([f"- {nm}\n" for nm in non_match])}")
# Matches:
# >SP001
# 1 to 4
# 5 to 8
# 9 to 12
# >SP002
# 11 to 14
# >SP007
# 0 to 3
# 6 to 9
# 12 to 15
# >SP010
# 0 to 3
# 4 to 7
# 8 to 11
# Invalid sequences:
# >SP003: {'2', '1'}
# >SP005: {'$', '@', '#'}
# >SP009: {'2', '1', '3'}
# Sequences without matches:
# - >SP004
# - >SP006
# - >SP008
Parse FASTA
Each entry is stored as a record
class FastaRecord:
def __init__(self, header, sequence) -> None:
self.id
self.name
self.seq
def __repr__(self) -> str:
def __str__(self) -> str:
def length(self) -> int:
Class and function to read a FASTA string
from biobase.parser import FastaParser, fasta_parser
fasta = """>CAA39742.1 cytochrome b (mitochondrion) [Sus scrofa]
MTNIRKSHPLMKIINNAFIDLPAPSNISSWWNFGSLLGICLILQILTGLFLAMHYTSDTTTAFSSVTHIC"""
# Class that yields generator
records = list(FastaParser(fasta))
# File parsing done with FastaFileParser(fasta_file_path)
r: FastaRecord = records[0]
print(r.id) # CAA39742.1
print(r.seq) # MTNIRKSHPLMKIINNAFIDLPAPSNISSWWNFGSLLGICLILQILTGLFLAMHYTSDTTTAFSSVTHIC
# Function that returns list
records = fasta_parser(fasta)
# File parsing done with fasta_file_parser(fasta_file_path)
for r in records:
print(r.id) # CAA39742.1
print(r.seq) # MTNIRKSHPLMKIINNAFIDLPAPSNISSWWNFGSLLGICLILQILTGLFLAMHYTSDTTTAFSSVTHIC
Parse FASTQ
Each entry is stored as a record
class FastqRecord:
def __init__(self, id: str, seq: str, separator: str, quality: str) -> None:
self.id: str
self.seq: str
self.separator: str
self.quality: str
def __repr__(self) -> str:
def __str__(self) -> str:
def length(self) -> int:
def convert_to_fasta(self) -> str:
def phred_scores(self) -> np.ndarray:
def average_quality(self) -> float:
from biobase.parser import FastqParser, fastq_parser
fastq = """@2fa9ee19-5c51-4281-abdd-eac86
CGGTAGCCAGCTGCGTTCAGTATG
+
%%%+++'''@@@???<<<??????"""
# Class that yields generator
records = list(FastqParser(fastq))
# File reading done with FastqFileParser(fastq_file_path)
r: FastqRecord = records[0]
print(r.id) # 2fa9ee19-5c51-4281-abdd-eac86
print(r.seq) # CGGTAGCCAGCTGCGTTCAGTATG
# Function that returns list
records = fastq_parser(fastq)
# File reading done with fastq_file_parser(fastq_file_path)
for r in records:
print(r.id) # 2fa9ee19-5c51-4281-abdd-eac86
print(r.seq) # CGGTAGCCAGCTGCGTTCAGTATG
class FastqFileParser(FastqParserBase):
def __init__(self, filepath: str) -> None:
self.filepath = filepath
def __iter__(self) -> Iterator[FastqRecord]:
def count_reads(self) -> int:
def filter_reads(self, min_avg_quality: float) -> Iterator[FastqRecord]:
def to_fasta(self) -> list[FastaRecord]:
def to_fasta_iter(self) -> Iterator[FastaRecord]:
def to_fasta_file(self, out_path: str) -> None:
def read_lengths(self) -> np.ndarray:
class FastqParser(FastqParserBase):
def __init__(self, reads: str) -> None:
self.reads = reads
def __iter__(self) -> Iterator[FastqRecord]:
def count_reads(self) -> int:
def filter_reads(self, min_avg_quality: float) -> Iterator[FastqRecord]:
def to_fasta(self) -> list[FastaRecord]:
def to_fasta_iter(self) -> Iterator[FastaRecord]:
def to_fasta_file(self, out_path: str) -> None:
def read_lengths(self) -> np.ndarray:
Parse GenBank
Each entry is parsed as a record
class GenBankRecord:
"""Represents a parsed GenBank record with entries"""
_entry_classes: dict[str, type] = {
"LOCUS": Locus,
"DEFINITION": Definition,
"ACCESSION": Accession,
"FEATURES": Features,
"ORIGIN": Origin,
"VERSION": Version,
}
def __init__(
self, entries: dict[str, Any], source_filepath: Path | None = None
) -> None:
self.id: str
self.seq: str
self.name: str
self.entries: dict[str, Any] # Dict of entry classes
self._source_filepath
def __repr__(self) -> str:
from biobase.parser import GenBankFileParser
"""
# GENBANK FILE CONTENTS
LOCUS ADF90000 50 bp DNA circular INV 01-JAN-2023
DEFINITION A test record.
ACCESSION ADF90000
VERSION ADF90000.1 GI:100000000
KEYWORDS second; test.
ORIGIN
1 cgatcggatc gattcggact ggatcgatcg atcggatcga tcggatcgga
//
"""
parser = GenBankFileParser(path_to_file)
records = list(parser)
r = records[0]
print(r.id) # ADF90000
print(r.seq) # cgatcggatcgattcggactggatcgatcgatcggatcgatcggatcgga
version = r.entries["VERSION"]
print(version.version) # ADF90000.1
# Entries
class Locus:
_MOLECULE_TYPE_LIST: list[str] = ["DNA", "RNA", "PROTEIN"]
def __init__(self, line: str) -> None:
self._raw_line: str
self._parts: list[str]
self.name: str
self.length: int
self.molecule_type: str
self.topology: str
self.date: str
self._set_info()
def _set_info(self):
def __repr__(self) -> str:
class Definition:
def __init__(self, info: str) -> None:
self.info: str
def __repr__(self) -> str:
class Accession:
def __init__(self, info: str) -> None:
self.info: str
def __repr__(self) -> str:
class Version:
def __init__(self, info: str) -> None:
parts: list[str]
self.version: str
self.gi: str | None
def __repr__(self) -> str:
class Origin:
def __init__(self, raw_text: str) -> None:
self._raw_text: str
@property
def sequence(self) -> str:
def __repr__(self) -> str:
class Features:
def __init__(self, info: str) -> None:
self.info: str
self.entries: list[SingleFeature]
self._parse_features()
def __repr__(self) -> str:
def _parse_features(self) -> None:
Requirements
- Python 3.10+
- pip or uv (for installation)
Installation
Regular Installation
pip install biobase
uv add biobase
Development Installation
Clone the repository and install in editable mode:
git clone https://github.com/lignum-vitae/biobase.git
cd biobase
uv pip install -e ".[dev]"
Files can be run with uv run <file_name> when you are in the same directory
as the file.
If you are not using uv, run files using the module path from the project root
so that relative imports work correctly. To run the sub_matrix file, use the command
python -m src.biobase.matrix.sub_matrix
Data Files
src/biobase/matrices/: Scoring matrix data stored in JSON file format
Project Goals
Biobase aims to provide Python-friendly versions of common biological constants and tools for bioinformatics pipelines. Key objectives:
- Standardize biological data structures
- Provide efficient implementations of common scoring systems
- Ensure type safety and validation
- Maintain comprehensive documentation
- Support modern Python practices
Contributing
We welcome contributions! Please read our:
Stability
This project is in the beta stage. APIs may change without warning until version 1.0.0.
License
This project is licensed under the MIT License - see the LICENSE file for details.
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
Built Distribution
Filter files by name, interpreter, ABI, and platform.
If you're not sure about the file name format, learn more about wheel file names.
Copy a direct link to the current filters
File details
Details for the file biobase-0.9.1.tar.gz.
File metadata
- Download URL: biobase-0.9.1.tar.gz
- Upload date:
- Size: 52.1 kB
- Tags: Source
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/6.2.0 CPython/3.9.25
File hashes
| Algorithm | Hash digest | |
|---|---|---|
| SHA256 | 2b8344f8be29fd6d6bbd1f50ea923a700845fd8195771d85f5b1554cf56d7eb9 | |
| MD5 | fbfa73af64caa38342c0a33812411b04 | |
| BLAKE2b-256 | fd0a680899fff94253574bac47a3c3ae583c728a7d4556b1d0a601915de9cf36 | |
File details
Details for the file biobase-0.9.1-py3-none-any.whl.
File metadata
- Download URL: biobase-0.9.1-py3-none-any.whl
- Upload date:
- Size: 79.2 kB
- Tags: Python 3
- Uploaded using Trusted Publishing? No
- Uploaded via: twine/6.2.0 CPython/3.9.25
File hashes
| Algorithm | Hash digest | |
|---|---|---|
| SHA256 | c7795b9fc0f8d8f582cb28cddd57d530162f8c676a206cd811adc4320041945c | |
| MD5 | 8b1b81cde91b4c01326083b09ae738fd | |
| BLAKE2b-256 | e5f96e88d73b316879153f87f7815fccc8b203af7839bf73d43490e3656310ba | |