Skip to main content

A comprehensive package of biological constants, serving as a foundational resource for biology and bioinformatics, complemented by functions to streamline related tasks.

Project description

Biobase

Static Badge Python Version from PEP 621 TOML PyPI version License: MIT GitHub branch check runs

A Python package providing standardized biological constants and substitution matrices for bioinformatics pipelines. Biobase aims to eliminate the need to repeatedly recreate common biological data structures and scoring systems in your code.

Table of Contents

Quick Start

Access amino acid properties

from biobase.constants import ONE_LETTER_CODES, MONO_MASS, CODON_TABLE
print(ONE_LETTER_CODES)      # 'ACDEFGHIKLMNPQRSTVWY'
print(MONO_MASS['A'])        # 71.037113805
print(CODON_TABLE["AUG"])    # M

Use substitution matrices

from biobase.matrix import Blosum
blosum62 = Blosum(62)
print(blosum62['A']['A'])  # 4
print(blosum62['W']['C'])  # -2
from biobase.matrix import Pam
pam200 = Pam(200)
print(pam200['A']['A'])  # 3
print(pam200['W']['C'])  # -9
from biobase.matrix import Identity
identity0 = Identity(0)
print(identity0['A']['A'])  # 1
print(identity0['W']['C'])  # 0
from biobase.matrix import Match
match_mat = Match()
print(match_mat['A']['A'])  # 1
print(match_mat['W']['C'])  # -1

Analyse DNA sequences

from biobase.analysis import Dna
sequence = "ATCGTAGC"
print(Dna.complement(sequence))               # 'TAGCATCG'
print(Dna.complement(sequence, reverse=True)) # 'GCTACGAT'
print(Dna.transcribe(sequence))               # 'AUCGUAGC'
print(Dna.translate(sequence))                # 'IV'
print(Dna.calculate_gc_content(sequence))     # 50.0
print(Dna.calculate_at_content(sequence))     # 50.0
print(Dna.entropy(sequence))                  # 2.0

seq = "ccatgccctaaatggggtag"
for start, end, orf in Dna.find_orfs(seq, include_seq=True):
    print(start, end, orf)
# 2, 11, "ATGCCCTAA"
# 11, 20, "ATGGGGTAG"

Analyse Nucleotides

from biobase.analysis import Nucleotides

print(Nucleotides.molecular_weight("A"))               # 135.13
print(Nucleotides.cumulative_molecular_weight("ATCG")) # 523.48
print(Nucleotides.translate("AUGUUGUCGCCUU"))          # 'MLSP'

Find protein motifs

from biobase.analysis import find_motifs
sequence = "ACDEFGHIKLMNPQRSTVWY"
print(find_motifs(sequence, "DEF"))
# [(2, 5)]

test_dict = {
    ">SP001": "ACDEFCDEFCDEFGHIKLMN",  # has matches for "CDE" that span indexes [(1, 4), (5, 8), (9, 12)]
    ">SP002": "MNPQRSTVWYACDEFGHIKL",  # has match for "CDE" that span indexes [(11, 14)]
    ">SP003": "AAAAAAAAAAAAAAAAAA12",  # invalid: contains "1", "2"
    ">SP004": "GGGGGGGGGGGGGGGGGGGG",  # no match
    ">SP005": "HHHHHHHHHHHHHHHHH@#$",  # invalid: contains "@", "#", "$"
    ">SP006": "DDDDDDDDDDDDDDDDDDDD",  # no match
    ">SP007": "CDEFGHCDEFKLCDEFPQRS",  # has matches for "CDE" that span indexes [(0, 3), (6, 9), (12, 15)]
    ">SP008": "LLLLLLLLLLLLLLLLLLLL",  # no match
    ">SP009": "KKKKKKKKKKKK123KKKKK",  # invalid: contains "1", "2", "3"
    ">SP010": "CDEACDEDCDEFAAAAAAAA",  # has matches for "CDE" that span indexes [(0, 3), (4, 7), (8, 11)]
}
matched, invalid, non_match = find_motifs(test_dict, "CDE")
print("Matches:")
for seq, matches in matched.items():
    print(f"{seq}")
    print("".join([f"{match[0]} to {match[1]}\n" for match in matches]))
print("Invalid sequences:\n" + "".join([f"{seq}: {invs}\n" for seq, invs in invalid.items()]))
print("Sequences without matches:\n" + "".join([f"- {nm}\n" for nm in non_match]))

# Matches:
# >SP001
# 1 to 4
# 5 to 8
# 9 to 12

# >SP002
# 11 to 14

# >SP007
# 0 to 3
# 6 to 9
# 12 to 15

# >SP010
# 0 to 3
# 4 to 7
# 8 to 11

# Invalid sequences:
# >SP003: {'2', '1'}
# >SP005: {'$', '@', '#'}
# >SP009: {'2', '1', '3'}

# Sequences without matches:
# - >SP004
# - >SP006
# - >SP008

Parse FASTA

Each entry is stored as a record

class FastaRecord:
    def __init__(self, header, sequence) -> None:
        self.id
        self.name
        self.seq
    def __repr__(self) -> str:
    def __str__(self) -> str:
    def length(self) -> int:

Class and function to read a FASTA string

from biobase.parser import FastaParser, fasta_parser
fasta = """>CAA39742.1 cytochrome b (mitochondrion) [Sus scrofa]
MTNIRKSHPLMKIINNAFIDLPAPSNISSWWNFGSLLGICLILQILTGLFLAMHYTSDTTTAFSSVTHIC"""

# Class that yields generator
records = list(FastaParser(fasta))
# File parsing done with FastaFileParser(fasta_file_path)
r: FastaRecord = records[0]
print(r.id) # CAA39742.1
print(r.seq) # MTNIRKSHPLMKIINNAFIDLPAPSNISSWWNFGSLLGICLILQILTGLFLAMHYTSDTTTAFSSVTHIC

# Function that returns list
records = fasta_parser(fasta)
# File parsing done with fasta_file_parser(fasta_file_path)
for r in records:
    print(r.id) # CAA39742.1
    print(r.seq) # MTNIRKSHPLMKIINNAFIDLPAPSNISSWWNFGSLLGICLILQILTGLFLAMHYTSDTTTAFSSVTHIC

Parse FASTQ

Each entry is stored as a record

class FastqRecord:
    def __init__(self, id: str, seq: str, separator: str, quality: str) -> None:
        self.id: str
        self.seq: str
        self.separator: str
        self.quality: str
    def __repr__(self) -> str:
    def __str__(self) -> str:
    def length(self) -> int:
    def convert_to_fasta(self) -> str:
    def phred_scores(self) -> np.ndarray:
    def average_quality(self) -> float:
from biobase.parser import FastqParser, fastq_parser
fastq = """@2fa9ee19-5c51-4281-abdd-eac86
CGGTAGCCAGCTGCGTTCAGTATG
+
%%%+++'''@@@???<<<??????"""

# Class that yields generator
records = list(FastqParser(fastq))
# File reading done with FastqFileParser(fastq_file_path)
r: FastqRecord = records[0]
print(r.id) # 2fa9ee19-5c51-4281-abdd-eac86
print(r.seq) # CGGTAGCCAGCTGCGTTCAGTATG

# Function that returns list
records = fastq_parser(fastq)
# File reading done with fastq_file_parser(fastq_file_path)
for r in records:
    print(r.id) # 2fa9ee19-5c51-4281-abdd-eac86
    print(r.seq) # CGGTAGCCAGCTGCGTTCAGTATG
class FastqFileParser(FastqParserBase):
    def __init__(self, filepath: str) -> None:
        self.filepath = filepath
    def __iter__(self) -> Iterator[FastqRecord]:
    def count_reads(self) -> int:
    def filter_reads(self, min_avg_quality: float) -> Iterator[FastqRecord]:
    def to_fasta(self) -> list[FastaRecord]:
    def to_fasta_iter(self) -> Iterator[FastaRecord]:
    def to_fasta_file(self, out_path: str) -> None:
    def read_lengths(self) -> np.ndarray:

class FastqParser(FastqParserBase):
    def __init__(self, reads: str) -> None:
        self.reads = reads
    def __iter__(self) -> Iterator[FastqRecord]:
    def count_reads(self) -> int:
    def filter_reads(self, min_avg_quality: float) -> Iterator[FastqRecord]:
    def to_fasta(self) -> list[FastaRecord]:
    def to_fasta_iter(self) -> Iterator[FastaRecord]:
    def to_fasta_file(self, out_path: str) -> None:
    def read_lengths(self) -> np.ndarray:

Parse GenBank

Each entry is parsed as a record

class GenBankRecord:
    """Represents a parsed GenBank record with entries"""

    _entry_classes: dict[str, type] = {
        "LOCUS": Locus,
        "DEFINITION": Definition,
        "ACCESSION": Accession,
        "FEATURES": Features,
        "ORIGIN": Origin,
        "VERSION": Version,
    }

    def __init__(
        self, entries: dict[str, Any], source_filepath: Path | None = None
    ) -> None:
        self.id: str
        self.seq: str
        self.name: str
        self.entries: dict[str, Any] # Dict of entry classes
        self._source_filepath
    def __repr__(self) -> str:
from biobase.parser import GenBankFileParser

"""
# GENBANK FILE CONTENTS

LOCUS       ADF90000            50 bp    DNA     circular INV 01-JAN-2023
DEFINITION  A test record.
ACCESSION   ADF90000
VERSION     ADF90000.1  GI:100000000
KEYWORDS    second; test.
ORIGIN
        1 cgatcggatc gattcggact ggatcgatcg atcggatcga tcggatcgga
//
"""

parser = GenBankFileParser(path_to_file)
records = list(parser)

r = records[0]
print(r.id)  # ADF90000
print(r.seq) # cgatcggatcgattcggactggatcgatcgatcggatcgatcggatcgga

version = r.entries["VERSION"]
print(version.version) # ADF90000.1

# Entries

class Locus:
    _MOLECULE_TYPE_LIST: list[str] = ["DNA", "RNA", "PROTEIN"]
    def __init__(self, line: str) -> None:
        self._raw_line: str
        self._parts: list[str]
        self.name: str
        self.length: int
        self.molecule_type: str
        self.topology: str
        self.date: str
        self._set_info()
    def _set_info(self):
    def __repr__(self) -> str:

class Definition:
    def __init__(self, info: str) -> None:
        self.info: str
    def __repr__(self) -> str:

class Accession:
    def __init__(self, info: str) -> None:
        self.info: str
    def __repr__(self) -> str:

class Version:
    def __init__(self, info: str) -> None:
        parts: list[str]
        self.version: str
        self.gi: str | None
    def __repr__(self) -> str:

class Origin:
    def __init__(self, raw_text: str) -> None:
        self._raw_text: str
    @property
    def sequence(self) -> str:
    def __repr__(self) -> str:

class Features:
    def __init__(self, info: str) -> None:
        self.info: str
        self.entries: list[SingleFeature]
        self._parse_features()
    def __repr__(self) -> str:
    def _parse_features(self) -> None:

Requirements

  • Python 3.10+
  • pip or uv (for installation)

Installation

Regular Installation

pip install biobase

uv add biobase

Development Installation

Clone the repository and install in editable mode:

git clone https://github.com/lignum-vitae/biobase.git
cd biobase
uv pip install -e ".[dev]"

Files can be run using uv run <file_name> if in the same directory/folder as the file.

If you are not using uv, run files by their module path from the project root so that relative imports resolve correctly. For example, to run the sub_matrix file, use the command python -m src.biobase.matrix.sub_matrix

Data Files

  • src/biobase/matrices/: Scoring matrix data stored in JSON file format

Project Goals

Biobase aims to provide Python-friendly versions of common biological constants and tools for bioinformatics pipelines. Key objectives:

  1. Standardize biological data structures
  2. Provide efficient implementations of common scoring systems
  3. Ensure type safety and validation
  4. Maintain comprehensive documentation
  5. Support modern Python practices

Contributing

We welcome contributions! Please read our:

Stability

This project is in the beta stage. APIs may change without warning until version 1.0.0.

License

This project is licensed under the MIT License - see the LICENSE file for details.

Project details


Download files

Download the file for your platform. If you're not sure which to choose, learn more about installing packages.

Source Distribution

biobase-0.9.1.tar.gz (52.1 kB view details)

Uploaded Source

Built Distribution

If you're not sure about the file name format, learn more about wheel file names.

biobase-0.9.1-py3-none-any.whl (79.2 kB view details)

Uploaded Python 3

File details

Details for the file biobase-0.9.1.tar.gz.

File metadata

  • Download URL: biobase-0.9.1.tar.gz
  • Upload date:
  • Size: 52.1 kB
  • Tags: Source
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.2.0 CPython/3.9.25

File hashes

Hashes for biobase-0.9.1.tar.gz
Algorithm Hash digest
SHA256 2b8344f8be29fd6d6bbd1f50ea923a700845fd8195771d85f5b1554cf56d7eb9
MD5 fbfa73af64caa38342c0a33812411b04
BLAKE2b-256 fd0a680899fff94253574bac47a3c3ae583c728a7d4556b1d0a601915de9cf36

See more details on using hashes here.

File details

Details for the file biobase-0.9.1-py3-none-any.whl.

File metadata

  • Download URL: biobase-0.9.1-py3-none-any.whl
  • Upload date:
  • Size: 79.2 kB
  • Tags: Python 3
  • Uploaded using Trusted Publishing? No
  • Uploaded via: twine/6.2.0 CPython/3.9.25

File hashes

Hashes for biobase-0.9.1-py3-none-any.whl
Algorithm Hash digest
SHA256 c7795b9fc0f8d8f582cb28cddd57d530162f8c676a206cd811adc4320041945c
MD5 8b1b81cde91b4c01326083b09ae738fd
BLAKE2b-256 e5f96e88d73b316879153f87f7815fccc8b203af7839bf73d43490e3656310ba

See more details on using hashes here.

Supported by

AWS Cloud computing and Security Sponsor Datadog Monitoring Depot Continuous Integration Fastly CDN Google Download Analytics Pingdom Monitoring Sentry Error logging StatusPage Status page