# Source code for riverine.references

from __future__ import annotations

from math import isnan
from os import PathLike
from pathlib import Path
from typing import TYPE_CHECKING, Any, Sequence, TextIO, cast

import attrs
from typing_extensions import TypeAlias

from .components import Strand
from .locations import PlateType, WellPos, _parse_wellpos_optional
from .mixes import PlateMap
from .units import (
    DNAN,
    Q_,
    Decimal,
    DecimalQuantity,
    _parse_conc_optional,
    _parse_conc_required,
    NAN_CONC,
    nM,
    ureg,
)

if TYPE_CHECKING:  # pragma: no cover
    from pandas.core.indexing import _LocIndexer

    from .mixes import PlateMap

import numpy as np
import pandas as pd

_REF_COLUMNS = ["Name", "Plate", "Well", "Concentration (nM)", "Sequence"]
[docs] _REF_DTYPES = [object, object, object, np.float64, object]
[docs] RefFile: TypeAlias = ( "str | tuple[str, DecimalQuantity | str | dict[str, DecimalQuantity]]" )
def _new_ref_df() -> pd.DataFrame:
    """Return an empty reference table.

    The frame has the columns listed in ``_REF_COLUMNS`` and no rows; the
    "Concentration (nM)" column is cast to float so that later
    concatenations start from a numeric column rather than ``object``.
    """
    empty = pd.DataFrame(columns=_REF_COLUMNS)
    return empty.astype({"Concentration (nM)": "float"})
@attrs.define()
class Reference:
    """
    A table of reference information about components.

    The data lives in :attr:`df`, a :class:`pandas.DataFrame` with the columns
    "Name", "Plate", "Well", "Concentration (nM)" and "Sequence".
    """

    # Backing table; a fresh empty table is created when none is supplied.
    df: pd.DataFrame = attrs.field(factory=_new_ref_df)

    @property
    def loc(self) -> _LocIndexer:
        """Label-based indexer of the underlying DataFrame (``self.df.loc``)."""
        return self.df.loc

    def __getitem__(self, key: Any) -> Any:
        """Forward ``reference[key]`` to the underlying DataFrame."""
        return self.df.__getitem__(key)

    def __eq__(self: Reference, other: object) -> bool:
        """
        Compare against another :class:`Reference` or a bare DataFrame.

        Two tables are equal when every cell is equal, where a missing value
        on both sides counts as equal.  Tables whose shapes or labels differ
        are unequal; any other type of object is unequal.
        """
        if isinstance(other, Reference):
            other_df = other.df
        elif isinstance(other, pd.DataFrame):
            other_df = other
        else:
            return False

        try:
            cells_match = (other_df == self.df) | (other_df.isna() & self.df.isna())
        except ValueError:
            # pandas refuses to compare frames that are not identically
            # labelled; such frames cannot be equal.
            return False
        # Collapse the numpy boolean into a plain ``bool``.
        return bool(cells_match.all().all())

    def __len__(self) -> int:
        """Number of rows in the reference table."""
        return len(self.df)

    def plate_map(
        self,
        name: str,
        plate_type: PlateType = PlateType.wells96,
    ) -> PlateMap:
        """
        Return a :class:`PlateMap` for a given plate name in the Reference.

        Parameters
        ----------

        name:
            Name of plate to make a :class:`PlateMap` for.

        plate_type:
            Either :data:`PlateType.wells96` or :data:`PlateType.wells384`;
            default is :data:`PlateType.wells96`.

        Returns
        -------
            a :class:`PlateMap` consisting of all strands in this Reference
            object from plate named `name`, built with the given `plate_type`.

        Raises
        ------
        ValueError:
            If `name` is not the name of a plate in the reference.
        """
        on_plate = self.df.loc[self.df["Plate"] == name]

        if on_plate.empty:
            # Rows may carry a missing plate (e.g. None); skip those and
            # stringify the rest so building the message can never fail.
            known_plates = sorted({str(p) for p in self.df["Plate"].dropna()})
            available = ", ".join(known_plates)
            raise ValueError(
                f'Plate "{name}" not found in reference file.'
                f"\nAvailable plate names: {available}"
            )

        well_to_strand_name = {}
        for row in on_plate.itertuples():
            strand = Strand(name=row.Name, sequence=row.Sequence)  # type: ignore
            well_to_strand_name[row.Well] = strand.name  # type: ignore

        return PlateMap(
            plate_name=name,
            plate_type=plate_type,
            well_to_strand_name=well_to_strand_name,
        )

    def search(
        self,
        name: str | None = None,
        plate: str | None = None,
        well: str | WellPos | None = None,
        concentration: str | DecimalQuantity | None = None,
        sequence: str | None = None,
    ) -> Reference:
        """
        Return a new :class:`Reference` holding only the matching rows.

        Every criterion that is given must match exactly; criteria left as
        ``None`` are ignored.  `well` is compared through its string form and
        `concentration` through its magnitude in nM.
        """
        well_pos = _parse_wellpos_optional(well)
        # NOTE(review): a missing concentration is expected to come back from
        # the parser as a NaN-valued quantity, which is what is tested below.
        conc_q = _parse_conc_optional(concentration)

        cdf = self.df

        if name is not None:
            cdf = cdf.loc[cdf["Name"] == name, :]
        if plate is not None:
            cdf = cdf.loc[cdf["Plate"] == plate, :]
        if well_pos is not None:
            cdf = cdf.loc[cdf["Well"] == str(well_pos), :]
        if not isnan(conc_q.m):
            conc = conc_q.m_as("nM")
            cdf = cdf.loc[cdf["Concentration (nM)"] == conc, :]
        if sequence is not None:
            cdf = cdf.loc[cdf["Sequence"] == sequence, :]
        return Reference(cdf)

    def get_concentration(
        self,
        name: str | None = None,
        plate: str | None = None,
        well: str | WellPos | None = None,
        concentration: str | DecimalQuantity | None = None,
        sequence: str | None = None,
    ) -> DecimalQuantity:
        """
        Return the concentration of the single row matching the criteria.

        The criteria have the same meaning as in :meth:`search`.

        Raises
        ------
        ValueError:
            If no row, or more than one row, matches.
        """
        valref = self.search(name, plate, well, concentration, sequence)

        if len(valref) == 1:
            return Q_(valref.df["Concentration (nM)"].iloc[0], nM)
        if len(valref) > 1:
            raise ValueError(
                f"Found multiple possible components: {valref!s}", valref
            )
        raise ValueError("Did not find any matching components.")

    @classmethod
    def from_csv(cls, filename_or_file: str | TextIO | PathLike[str]) -> Reference:
        """
        Load reference information from a CSV file.

        The reference information loaded by this function should be compiled
        manually, fitting the :ref:`mix reference` format, or be loaded with
        :func:`compile_reference` or :func:`update_reference`.
        """

        def _conc_from_cell(cell: Any) -> Any:
            # Converters receive the raw cell text.  `to_csv` writes a missing
            # concentration as an empty cell, which `Decimal("")` rejects, so
            # map blanks to the NaN magnitude used elsewhere in this class.
            text = str(cell).strip()
            return Decimal(text) if text else DNAN

        df = pd.read_csv(
            filename_or_file, converters={"Concentration (nM)": _conc_from_cell}
        )

        # Keep only the reference columns, in canonical order; columns absent
        # from the file are created empty.
        df = df.reindex(
            ["Name", "Plate", "Well", "Concentration (nM)", "Sequence"], axis="columns"
        )

        return cls(df)

    def to_csv(self, filename: str | PathLike[str]) -> None:
        """Write the reference table to `filename` as CSV, without the index."""
        self.df.to_csv(filename, index=False, float_format="%.6f")

    def update(
        self: Reference, files: Sequence[RefFile] | RefFile, round: int = -1
    ) -> Reference:
        """
        Update reference information.

        This updates an existing reference dataframe with new files, with the
        same methods as :func:`compile_reference`.

        Parameters
        ----------

        files:
            A single reference file specification, or a sequence of them.  A
            specification is a path, or a ``(path, concentration info)`` tuple
            where the info is one concentration or a dict of concentrations
            keyed by sheet name (with an optional ``"default"`` entry).

        round:
            Number of decimals passed to ``Series.round`` for the measured
            concentrations of a plate spec sheet.

        Returns
        -------
            This object, with the rows of every file appended to :attr:`df`.

        Raises
        ------
        ValueError:
            If a plate spec workbook has more than one sheet, or a plate order
            sheet has a "Plate" column disagreeing with its sheet name.
        NotImplementedError:
            If a file's type is not recognised.
        """
        # Decide whether `files` is ONE specification or a sequence of them.
        if isinstance(files, str):
            is_single = True
        elif len(files) == 2 and isinstance(files[1], str):
            # Ambiguous: ("file", "100 nM") versus ("file1", "file2").  Treat
            # the second string as a concentration unless it names a file.
            is_single = not Path(files[1]).exists()
        else:
            # A tuple whose second item is neither a string, a nested
            # specification nor a path can only be (path, concentration info),
            # e.g. a dict or a quantity.
            is_single = (
                isinstance(files, tuple)
                and len(files) == 2
                and not isinstance(files[1], (tuple, list, PathLike))
            )

        if is_single:
            files_list: Sequence[RefFile] = [cast(RefFile, files)]
        else:
            files_list = cast(Sequence[RefFile], files)

        # FIXME: how to deal with repeats?
        for filename in files_list:
            filetype = None
            all_conc = None
            conc_dict: dict[str, DecimalQuantity] = {}

            if isinstance(filename, tuple):
                conc_info = filename[1]
                filepath = Path(filename[0])

                if isinstance(conc_info, dict):
                    conc_dict = {
                        k: _parse_conc_required(v)
                        for k, v in cast(
                            dict[str, DecimalQuantity], conc_info
                        ).items()
                    }
                    # "default" applies to every sheet without its own entry.
                    if "default" in conc_dict:
                        all_conc = _parse_conc_required(conc_dict["default"])
                        del conc_dict["default"]
                else:
                    all_conc = _parse_conc_required(conc_info)
            else:
                filepath = Path(filename)

            if filepath.suffix in (".xls", ".xlsx"):
                # sheet_name=None loads every sheet, keyed by sheet name.
                data: dict[str, pd.DataFrame] = pd.read_excel(filepath, sheet_name=None)
                if "Plate Specs" in data:
                    if len(data) > 1:
                        raise ValueError(
                            f"Plate specs file {filepath} should only have one sheet, but has {len(data)}."
                        )
                    sheet: pd.DataFrame = data["Plate Specs"]
                    filetype = "plate-specs"

                    sheet.rename(lambda x: x.lower(), inplace=True, axis="columns")

                    # The sheet reports µM; the reference stores nM.
                    sheet.loc[:, "Concentration (nM)"] = 1000 * sheet.loc[
                        :, "measured concentration µm "
                    ].round(round)
                    sheet.loc[:, "Sequence"] = [
                        x.replace(" ", "") for x in sheet.loc[:, "sequence"]
                    ]
                    sheet.loc[:, "Well"] = [
                        str(WellPos(x)) for x in sheet.loc[:, "well position"]
                    ]
                    sheet.rename(
                        {
                            "plate name": "Plate",
                            "sequence name": "Name",
                        },
                        axis="columns",
                        inplace=True,
                    )

                    self.df = pd.concat(
                        (self.df, sheet.loc[:, _REF_COLUMNS]), ignore_index=True
                    )

                    continue

                # Anything else is taken to be a plate order workbook, one
                # sheet per plate.
                # FIXME: need better check here
                # if not all(
                #    next(iter(data.values())).columns
                #    == ["Well Position", "Name", "Sequence"]
                # ):
                #    raise ValueError
                filetype = "plates-order"
                for k, v in data.items():
                    if "Plate" in v.columns:
                        # There's already a plate column.  That's problematic.
                        # Let's check, then delete it.
                        if not all(v["Plate"] == k):
                            raise ValueError(
                                f"Not all rows in sheet {k} have same plate value (normal IDT order files do not have a plate column)."
                            )
                        del v["Plate"]
                    v["Concentration (nM)"] = conc_dict.get(
                        k, all_conc if all_conc is not None else NAN_CONC
                    ).m_as(nM)
                # The sheet names become the "Plate" column; the per-sheet row
                # index ("level_1" after reset_index) is dropped.
                all_seqs = (
                    pd.concat(data.values(), keys=data.keys(), names=["Plate"])
                    .reset_index()
                    .drop(columns=["level_1"])
                )
                all_seqs.rename(
                    {"Well Position": "Well", "Well position": "Well"},
                    axis="columns",
                    inplace=True,
                )
                all_seqs.loc[:, "Well"] = all_seqs.loc[:, "Well"].map(
                    lambda x: str(WellPos(x))
                )

                self.df = pd.concat((self.df, all_seqs), ignore_index=True)
                continue

            if filepath.suffix == ".csv":
                # Are we a COA file?  If so, it isn't valid Unicode...
                # We'll check initially in binary mode.
                with filepath.open("rb") as f:
                    testbin = f.read(25)

                if testbin == b'"Sales Order","Reference"':
                    # We're a COA file... in case IDT fixes things, we'll try
                    # UTF-8
                    try:
                        df = pd.read_csv(filepath)
                    except UnicodeDecodeError:
                        df = pd.read_csv(filepath, encoding="iso8859-1")
                    self.df = pd.concat(
                        (self.df, _parse_idt_coa(df)), ignore_index=True
                    )
                    continue

                tubedata = pd.read_csv(filepath)
                filetype = "idt-bulk"

            if filepath.suffix == ".txt":
                tubedata = pd.read_table(filepath)
                filetype = "idt-bulk"

            if filetype == "idt-bulk":
                # Bulk orders are tubes: no plate position.
                tubedata["Plate"] = "tube"
                tubedata["Well"] = None
                tubedata["Concentration (nM)"] = (
                    all_conc.m_as(nM) if all_conc is not None else DNAN
                )
                self.df = pd.concat(
                    (self.df, tubedata.loc[:, _REF_COLUMNS]), ignore_index=True
                )
                continue

            raise NotImplementedError(
                f"Unrecognized reference file type: {filepath}"
            )

        # FIXME: validation

        return self

    @classmethod
    def compile(cls, files: Sequence[RefFile] | RefFile, round: int = -1) -> Reference:
        """
        Compile reference information.

        This loads information from the following sources:

        - An IDT plate order spreadsheet.  This does not include concentration.  To add concentration information, list it as a tuple of
          :code:`(file, concentration)`.
        - An IDT bulk order entry text file.
        - An IDT plate spec sheet.
        """
        return cls().update(files, round=round)
# NOTE(review): this repeats, with the same value, the `_REF_COLUMNS` list
# assigned near the top of the module; it is kept so the name's value is
# unchanged for everything defined below.
_REF_COLUMNS = ["Name", "Plate", "Well", "Concentration (nM)", "Sequence"]
def _parse_idt_coa(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert a table read from an IDT COA CSV file into reference columns.

    Reads the "Sequence Name", "Well Position", "Conc" and "Sequence" columns
    of `df` and returns a new frame restricted to ``_REF_COLUMNS``.  Well
    positions are normalised through :class:`WellPos`, concentrations are
    parsed as quantities and expressed in nM, spaces are stripped from the
    sequences, and "Plate" is left empty.  The frame passed in is not
    modified.
    """
    # `rename` without `inplace` returns a new frame, so the caller's table
    # is left untouched by the column assignments below.
    coa = df.rename(columns={"Sequence Name": "Name"})
    coa["Well"] = coa["Well Position"].map(lambda x: str(WellPos(x)))
    coa["Concentration (nM)"] = coa["Conc"].map(
        lambda x: ureg.Quantity(x).m_as(nM)
    )
    coa["Plate"] = None
    coa["Sequence"] = coa["Sequence"].str.replace(" ", "")
    return coa.loc[:, _REF_COLUMNS]
def load_reference(filename_or_file: str | TextIO | PathLike[str]) -> Reference:
    """
    Load reference information from a CSV file.

    Thin module-level wrapper around :meth:`Reference.from_csv`; it accepts
    the same path or open-file argument and returns the resulting
    :class:`Reference`.
    """
    return Reference.from_csv(filename_or_file)