# Source code for pod5.tools.pod5_filter

"""
Tool for filtering pod5 files into a single output using a list of read ids
"""


from pathlib import Path
from typing import List, Set
from uuid import UUID

from pod5.tools.parsers import prepare_pod5_filter_argparser, run_tool
from pod5.tools.pod5_subset import calculate_transfers, launch_subsetting


def parse_ids(ids_path: Path) -> Set[str]:
    """
    Parse a text file of read ids, checking that every entry is a valid UUID.

    Blank lines, lines starting with ``#`` and a ``read_id`` header line are
    skipped. Every other line is stripped of surrounding whitespace and must
    be accepted by :class:`uuid.UUID`.

    Args:
        ids_path: Path to a text file containing one read id per line.

    Returns:
        The set of unique read id strings, stored as written in the file
        (after stripping whitespace).

    Raises:
        RuntimeError: If a line is not a valid UUID. The message reports the
            1-based line number and the offending text.
    """
    read_ids: Set[str] = set()
    with ids_path.open("r") as _fh:
        # Count from 1 so the reported line number matches what a text editor shows
        for line_no, raw in enumerate(_fh, start=1):
            line = raw.strip()
            if not line or line.startswith("#") or line == "read_id":
                continue

            try:
                # Constructing a UUID is only used as a validity check
                UUID(line)
            except ValueError as exc:
                # Report the stripped text so the message has no embedded newline
                raise RuntimeError(
                    f'Invalid UUID read_id on line {line_no} - "{line}"'
                ) from exc

            read_ids.add(line)

    return read_ids
def filter_pod5(
    inputs: List[Path],
    output: Path,
    ids: Path,
    missing_ok: bool,
    duplicate_ok: bool,
    force_overwrite: bool,
) -> None:
    """
    Prepare the pod5 filter mapping and run the repacker.

    Every read id listed in ``ids`` is targeted at the single ``output`` file.

    Args:
        inputs: Source pod5 file paths, passed to ``calculate_transfers``.
        output: Destination path. Its parent directories are created if needed.
        ids: Path to the read id list, parsed by ``parse_ids``.
        missing_ok: Forwarded unchanged to ``calculate_transfers``.
        duplicate_ok: Forwarded unchanged to ``calculate_transfers``.
        force_overwrite: Allow an existing ``output`` file to be replaced.

    Raises:
        FileExistsError: If ``output`` exists and ``force_overwrite`` is not set.
        AssertionError: If the read id list contains no read ids.
    """
    if output.exists() and not force_overwrite:
        raise FileExistsError(
            f"Output file already exists and --force_overwrite not set - {output}"
        )

    # Validate the read id list before modifying the filesystem, so that a bad
    # or empty list does not destroy an existing output file
    read_ids = parse_ids(ids)
    if not read_ids:
        raise AssertionError("Selected 0 read_ids. Nothing to do")

    # Remove the existing output file (only reachable with force_overwrite set)
    if output.exists():
        output.unlink()

    # Create parent directories if they do not exist
    output.parent.mkdir(parents=True, exist_ok=True)

    resolved_targets = {_id: {output} for _id in read_ids}

    # Map the target outputs to which source read ids they're comprised of
    transfers = calculate_transfers(
        inputs=list(inputs),
        read_targets=resolved_targets,
        missing_ok=missing_ok,
        duplicate_ok=duplicate_ok,
    )

    print("Subsetting please wait...")
    launch_subsetting(transfers=transfers)
    print("Done")
def main():
    """Build the pod5 filter argument parser and hand it to the tool runner."""
    parser = prepare_pod5_filter_argparser()
    run_tool(parser)
# Run the command line tool only when this module is executed as a script
if __name__ == "__main__":
    main()