variantplaner

VariantPlaner, a toolkit to manage many variants without requiring large CPU and RAM resources.

Convert a vcf into parquet, convert annotations into parquet, convert parquet into vcf.

It also builds a file structure that keeps variant database interrogation times fast.
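
As a quick orientation, here is a minimal usage sketch, not taken from the project documentation: it assumes a local sample.vcf whose header carries contig length lines, and uses the Vcf and VcfParsingBehavior classes listed below.

import pathlib

from variantplaner import Vcf, VcfParsingBehavior

# Parse the vcf; contig lengths are read from the header because chr2len_path is None
vcf = Vcf()
vcf.from_path(pathlib.Path("sample.vcf"), None, behavior=VcfParsingBehavior.MANAGE_SV)

# Extract genotypes and store them as parquet
genotypes = vcf.genotypes()
genotypes.lf.collect().write_parquet("genotypes.parquet")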

Modules:

  • cli

    Module containing the command line entry point functions.

  • exception

    Exceptions that can be raised by VariantPlaner.

  • generate

    Functions to generate information.

  • io

    Module managing input parsing and output serialization.

  • normalization

    Functions used to normalize data.

  • objects

    Module storing variantplaner objects.

  • struct

    Generated data structures for easy integration.

Classes:

  • Annotations

    Object to manage lazyframe as Annotations.

  • ContigsLength

    Store contigs -> length information.

  • Genotypes

    Object to manage lazyframe as Genotypes.

  • Pedigree

    Object to manage lazyframe as Pedigree.

  • Variants

    Object to manage lazyframe as Variants.

  • Vcf

    Object to manage lazyframe as Vcf.

  • VcfHeader

    Object that parses and stores vcf header information.

  • VcfParsingBehavior

    Enumeration used to control the behavior of IntoLazyFrame.

Annotations

Annotations()

Bases: LazyFrame

Object to manage lazyframe as Annotations.

Methods:

  • minimal_schema

    Get minimal schema of annotations polars.LazyFrame.

Source code in src/variantplaner/objects/annotations.py
def __init__(self):
    """Initialize a Annotations object."""
    self.lf = polars.LazyFrame(schema=Annotations.minimal_schema())

minimal_schema classmethod

minimal_schema() -> dict[str, type]

Get minimal schema of annotations polars.LazyFrame.

Source code in src/variantplaner/objects/annotations.py
@classmethod
def minimal_schema(cls) -> dict[str, type]:
    """Get minimal schema of genotypes polars.LazyFrame."""
    return {
        "id": polars.UInt64,
    }

ContigsLength

ContigsLength()

Store contigs -> length information.

Methods:

Source code in src/variantplaner/objects/contigs_length.py
def __init__(self):
    """Initialise a contigs length."""
    self.lf = polars.LazyFrame(
        schema={
            "contig": polars.String,
            "length": polars.UInt64,
            "offset": polars.UInt64,
        }
    )

from_path

from_path(
    path: Path, /, **scan_csv_args: Unpack[ScanCsv]
) -> int

Fill the object from the file pointed to by a pathlib.Path.

Argument: path: path of the input file

Returns: Number of contig lines read

Source code in src/variantplaner/objects/contigs_length.py
def from_path(self, path: pathlib.Path, /, **scan_csv_args: Unpack[ScanCsv]) -> int:
    """Fill object with file point by pathlib.Path.

    Argument:
    path: path of input file

    Returns: Number of contigs line view
    """
    csv = Csv()
    csv.from_path(path, **scan_csv_args)
    self.lf = csv.lf

    self.__compute_offset()

    return self.lf.collect(engine="cpu").shape[0]

from_vcf_header

from_vcf_header(header: VcfHeader) -> int

Fill the object from a VcfHeader.

Argument

header: VcfHeader

Returns: Number of contig lines read

Source code in src/variantplaner/objects/contigs_length.py
def from_vcf_header(self, header: VcfHeader) -> int:
    """Fill a object with VcfHeader.

    Argument:
       header: VcfHeader

    Returns: Number of contigs line view
    """
    contigs_id = re.compile(r"ID=(?P<id>[^,]+)")
    contigs_len = re.compile(r"length=(?P<length>[^,>]+)")

    count = 0
    contigs2len: dict[str, list] = {"contig": [], "length": []}
    for contig_line in header.contigs:
        if (len_match := contigs_len.search(contig_line)) and (id_match := contigs_id.search(contig_line)):
            contigs2len["contig"].append(id_match.groupdict()["id"])
            contigs2len["length"].append(int(len_match.groupdict()["length"]))
        count += 1

    self.lf = polars.LazyFrame(contigs2len, schema={"contig": polars.String, "length": polars.UInt64})

    self.__compute_offset()

    return count
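
A minimal sketch of how from_vcf_header can be used, assuming a header-only file read with VcfHeader.from_files (the path is hypothetical):

import pathlib

from variantplaner import ContigsLength, VcfHeader

header = VcfHeader()
header.from_files(pathlib.Path("header_only.txt"))  # hypothetical file holding only vcf header lines

contigs = ContigsLength()
nb_lines = contigs.from_vcf_header(header)  # number of contig lines seen in the header
print(nb_lines, contigs.lf.collect())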

Genotypes

Genotypes(data: LazyFrame | None = None)

Bases: LazyFrame

Object to manage lazyframe as Genotypes.

Methods:

Source code in src/variantplaner/objects/genotypes.py
def __init__(self, data: polars.LazyFrame | None = None):
    """Initialize a Genotypes object."""
    if data is None:
        self.lf = polars.LazyFrame(schema=Genotypes.minimal_schema())
    else:
        self.lf = data

minimal_schema classmethod

minimal_schema() -> dict[str, type]

Get minimal schema of genotypes polars.LazyFrame.

Source code in src/variantplaner/objects/genotypes.py
@classmethod
def minimal_schema(cls) -> dict[str, type]:
    """Get minimal schema of genotypes polars.LazyFrame."""
    return {
        "id": polars.UInt64,
        "sample": polars.String,
    }

samples_names

samples_names() -> list[str]

Get the list of sample names.

Source code in src/variantplaner/objects/genotypes.py
def samples_names(self) -> list[str]:
    """Get list of sample name."""
    return self.lf.select("sample").unique("sample").collect().get_column("sample").to_list()
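
A small sketch showing samples_names on a hand-built lazyframe (the sample values are made up):

import polars

from variantplaner import Genotypes

lf = polars.LazyFrame(
    {"id": [1, 1, 2], "sample": ["alice", "bob", "alice"], "gt": [1, 2, 1]},
    schema_overrides={"id": polars.UInt64},
)

genotypes = Genotypes(lf)
print(genotypes.samples_names())  # ['alice', 'bob'] (order not guaranteed)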

Pedigree

Pedigree()

Bases: LazyFrame

Object to manage lazyframe as Pedigree.

Methods:

Source code in src/variantplaner/objects/pedigree.py
def __init__(self):
    """Initialize a Variants object."""
    self.lf = polars.LazyFrame(schema=Pedigree.minimal_schema())

from_path

from_path(input_path: Path) -> None

Read a pedigree file into a polars.LazyFrame.

Parameters:

  • input_path (Path) –

    Path to pedigree file.

Returns:

  • None

    A polars.LazyFrame that contains ped information ('family_id', 'personal_id', 'father_id', 'mother_id', 'sex', 'affected')

Source code in src/variantplaner/objects/pedigree.py
def from_path(self, input_path: pathlib.Path) -> None:
    """Read a pedigree file in [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html).

    Args:
        input_path: Path to pedigree file.

    Returns:
        A [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) that contains ped information ('family_id', 'personal_id', 'father_id', 'mother_id', 'sex', 'affected')
    """
    self.lf = polars.scan_csv(
        input_path,
        separator="\t",
        has_header=False,
        null_values=["None", "unknown"],
        new_columns=[
            "family_id",
            "personal_id",
            "father_id",
            "mother_id",
            "sex",
            "affected",
        ],
        schema_overrides=Pedigree.minimal_schema(),
    )

minimal_schema classmethod

minimal_schema() -> Mapping[str, PolarsDataType]

Get schema of pedigree polars.LazyFrame.

Source code in src/variantplaner/objects/pedigree.py
@classmethod
def minimal_schema(
    cls,
) -> collections.abc.Mapping[str, polars._typing.PolarsDataType]:
    """Get schema of variants polars.LazyFrame."""
    return {
        "family_id": polars.String,
        "personal_id": polars.String,
        "father_id": polars.String,
        "mother_id": polars.String,
        "sex": polars.String,
        "affected": polars.Boolean,
    }

to_path

to_path(output_path: Path) -> None

Write pedigree polars.LazyFrame in ped format.

Warning: This function performs polars.LazyFrame.collect before writing the csv, which can have a significant impact on memory usage.

Parameters:

  • lf

    LazyFrame containing pedigree information.

  • output_path (Path) –

    Path where the pedigree information is written.

Returns:

  • None

    None

Source code in src/variantplaner/objects/pedigree.py
def to_path(self, output_path: pathlib.Path) -> None:
    """Write pedigree [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) in ped format.

    Warning: This function performs [polars.LazyFrame.collect][] before write csv, this can have a significant impact on memory usage

    Args:
        lf: LazyFrame contains pedigree information.
        output_path: Path where write pedigree information.

    Returns:
        None
    """
    self.lf.collect(engine="cpu").write_csv(output_path, include_header=False, separator="\t")
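
A short round-trip sketch combining from_path and to_path, assuming a tab-separated pedigree file at a hypothetical path:

import pathlib

from variantplaner import Pedigree

pedigree = Pedigree()
pedigree.from_path(pathlib.Path("family.ped"))  # hypothetical tab-separated ped file
print(pedigree.lf.collect())

pedigree.to_path(pathlib.Path("family_copy.ped"))  # collects, then writes in ped format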

Variants

Variants(data: LazyFrame | None = None)

Bases: LazyFrame

Object to manage lazyframe as Variants.

Methods:

Source code in src/variantplaner/objects/variants.py
def __init__(self, data: polars.LazyFrame | None = None):
    """Initialize a Variants object."""
    if data is None:
        self.lf = polars.LazyFrame(schema=Variants.minimal_schema())
    else:
        self.lf = data

minimal_schema classmethod

minimal_schema() -> dict[str, type]

Get schema of variants polars.LazyFrame.

Source code in src/variantplaner/objects/variants.py
@classmethod
def minimal_schema(cls) -> dict[str, type]:
    """Get schema of variants polars.LazyFrame."""
    return {
        "id": polars.UInt64,
        "chr": polars.String,
        "pos": polars.UInt64,
        "ref": polars.String,
        "alt": polars.String,
    }
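
A minimal sketch of building a Variants object that respects this schema (the values are made up):

import polars

from variantplaner import Variants

data = polars.LazyFrame(
    {"id": [1], "chr": ["1"], "pos": [10146], "ref": ["AC"], "alt": ["A"]},
    schema_overrides=Variants.minimal_schema(),
)

variants = Variants(data)
print(variants.lf.collect())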

Vcf

Vcf()

Object to manage lazyframe as Vcf.

Methods:

Source code in src/variantplaner/objects/vcf.py
def __init__(self):
    """Initialize a Vcf object."""
    self.lf = polars.LazyFrame(schema=Variants.minimal_schema())

    self.header = VcfHeader()

add_genotypes

add_genotypes(genotypes_lf: Genotypes) -> None

Add genotype information to the vcf.

Source code in src/variantplaner/objects/vcf.py
def add_genotypes(self, genotypes_lf: Genotypes) -> None:
    """Add genotypes information in vcf."""
    for sample in genotypes_lf.samples_names():
        geno2sample = (
            genotypes_lf.lf.filter(polars.col("sample") == sample)
            .rename(
                {col: f"{sample}_{col}" for col in genotypes_lf.lf.collect_schema().names()[2:]},
            )
            .drop("sample")
        )
        self.lf = self.lf.join(geno2sample, on="id", how="full", coalesce=True)

annotations

annotations(
    select_info: set[str] | None = None,
) -> Annotations

Get annotations of vcf.

Source code in src/variantplaner/objects/vcf.py
def annotations(self, select_info: set[str] | None = None) -> Annotations:
    """Get annotations of vcf."""
    lf = self.lf.with_columns(self.header.info_parser(select_info))

    return lf.drop("chr", "pos", "ref", "alt", "format", "info")

from_path

from_path(
    path: Path,
    chr2len_path: Path | None,
    behavior: VcfParsingBehavior = NOTHING,
) -> None

Populate Vcf object with vcf file.

Source code in src/variantplaner/objects/vcf.py
def from_path(
    self,
    path: pathlib.Path,
    chr2len_path: pathlib.Path | None,
    behavior: VcfParsingBehavior = VcfParsingBehavior.NOTHING,
) -> None:
    """Populate Vcf object with vcf file."""
    with xopen.xopen(path) as fh:
        try:
            self.header.from_lines(fh)
        except NotVcfHeaderError as e:
            raise NotAVCFError(path) from e

    chr2len = ContigsLength()
    if chr2len_path is not None:
        if chr2len.from_path(chr2len_path) == 0 and chr2len.from_vcf_header(self.header) == 0:
            raise NoContigsLengthInformationError
    elif chr2len.from_vcf_header(self.header) == 0:
        raise NoContigsLengthInformationError

    self.lf = polars.scan_csv(
        path,
        separator="\t",
        comment_prefix="#",
        has_header=False,
        schema_overrides=Vcf.schema(),
        new_columns=list(Vcf.schema().keys()),
    )

    schema = self.lf.collect_schema()
    self.lf = self.lf.rename(dict(zip(schema.names(), self.header.column_name(schema.len()))))
    self.lf = self.lf.cast(Vcf.schema())

    if behavior & VcfParsingBehavior.MANAGE_SV:
        self.lf = self.lf.with_columns(self.header.info_parser({"SVTYPE", "SVLEN"}))

    if not (behavior & VcfParsingBehavior.KEEP_STAR):
        self.lf = self.lf.filter(polars.col("alt") != "*")

    self.lf = normalization.add_variant_id(self.lf, chr2len.lf)

    if behavior & VcfParsingBehavior.MANAGE_SV:
        self.lf = self.lf.drop("SVTYPE", "SVLEN", strict=False)
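
A usage sketch combining behavior flags with an explicit contig length companion file; both paths are hypothetical, and the companion file is assumed to be a csv exposing contig and length columns:

import pathlib

from variantplaner import Vcf, VcfParsingBehavior

behavior = VcfParsingBehavior.MANAGE_SV | VcfParsingBehavior.KEEP_STAR

vcf = Vcf()
vcf.from_path(
    pathlib.Path("sample.vcf"),
    pathlib.Path("contigs_length.csv"),  # hypothetical companion file with contig lengths
    behavior=behavior,
)
print(vcf.lf.collect_schema().names())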

genotypes

genotypes() -> Genotypes

Get genotype of vcf.

Source code in src/variantplaner/objects/vcf.py
def genotypes(self) -> Genotypes:
    """Get genotype of vcf."""
    schema = self.lf.collect_schema()

    if "format" not in schema.names():
        raise NoGenotypeError

    lf = self.lf.select([*schema.names()[schema.names().index("format") :]])
    schema = lf.collect_schema()

    # Split genotype column in sub value
    col2expr = self.header.format_parser()

    format_strs = lf.select("format").unique().collect().get_column("format")

    sublfs = []
    for fstr in format_strs.to_list():
        # Found index of genotyping value
        col_index = {
            key: index
            for (index, key) in enumerate(
                fstr.split(":"),
            )
        }

        sublf = lf.filter(polars.col("format") == fstr)

        # Pivot value
        sublf = sublf.unpivot(index=["id"]).with_columns(
            [
                polars.col("id"),
                polars.col("variable").alias("sample"),
                polars.col("value").str.split(":"),
            ],
        )

        conversion = []
        for col in col2expr:
            if col in col_index:
                conversion.append(
                    polars.col("value")
                    .list.get(col_index[col], null_on_oob=True)
                    .pipe(function=col2expr[col], col_name=col)
                )
            else:
                conversion.append(polars.lit("").pipe(function=col2expr[col], col_name=col))

        sublf = sublf.with_columns(conversion)

        sublfs.append(sublf)

    genotypes = Genotypes()
    lf = polars.concat(sublfs).drop("variable", "value")

    if "gt".upper() in col2expr:
        lf = lf.filter(polars.col("gt") != 0)

    genotypes.lf = lf

    return genotypes
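
Continuing the earlier sketch, a short example of pulling genotypes out of a parsed Vcf; NoGenotypeError is raised when the vcf has no format column:

genotypes = vcf.genotypes()
print(genotypes.samples_names())
print(genotypes.lf.collect_schema().names())  # id, sample, then one column per FORMAT field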

schema classmethod

schema() -> Mapping[str, PolarsDataType]

Get schema of Vcf polars.LazyFrame.

Source code in src/variantplaner/objects/vcf.py
@classmethod
def schema(cls) -> collections.abc.Mapping[str, polars._typing.PolarsDataType]:
    """Get schema of Vcf polars.LazyFrame."""
    return {
        "chr": polars.String,
        "pos": polars.UInt64,
        "vid": polars.String,
        "ref": polars.String,
        "alt": polars.String,
        "qual": polars.String,
        "filter": polars.String,
        "info": polars.String,
    }

set_variants

set_variants(variants: Variants) -> None

Set variants of vcf.

Source code in src/variantplaner/objects/vcf.py
def set_variants(self, variants: Variants) -> None:
    """Set variants of vcf."""
    self.lf = variants.lf

variants

variants() -> Variants

Get variants of vcf.

Source code in src/variantplaner/objects/vcf.py
def variants(self) -> Variants:
    """Get variants of vcf."""
    return self.lf.select(Variants.minimal_schema().keys())

VcfHeader

VcfHeader()

Object that parses and stores vcf header information.

Methods:

Attributes:

  • contigs (Iterator[str]) –

    Get an iterator over the header lines containing chromosome information.

  • samples_index (dict[str, int] | None) –

    Read vcf header to generate an association map between sample name and index.

Source code in src/variantplaner/objects/vcf_header.py
def __init__(self):
    """Initialise VcfHeader."""
    self._header = []

contigs cached property

contigs: Iterator[str]

Get an iterator over the header lines containing chromosome information.

Returns: String iterator

samples_index cached property

samples_index: dict[str, int] | None

Read vcf header to generate an association map between sample name and index.

Args: header: Header string.

Returns: Map that associates a sample name to its sample index.

Raises: NotVcfHeaderError: If no header line starts with '#CHROM'

build_metadata

build_metadata(
    select_columns: list[str] | None = None,
) -> dict[str, str]

Generate metadata associated with the vcf header.

Args: select_columns: Output only columns in this list.

Returns: A map associating each column name with its corresponding header line

Source code in src/variantplaner/objects/vcf_header.py
def build_metadata(self, select_columns: list[str] | None = None) -> dict[str, str]:
    """Generate metadata associate to vcf_header.

    Args:
    select_columns: Output only columns in this list.

    Returns: An associations map for column name to corresponding header line
    """
    metadata: dict[str, str] = {}

    for line in self._header:
        if line.startswith("#CHROM"):
            return metadata

        if (
            (line.startswith("##FORMAT"))
            and (search := FORMAT_RE.search(line))
            and (not select_columns or search["id"].lower() in select_columns)
        ):
            metadata[search["id"].lower()] = line.lstrip("#")

        if (
            line.startswith("##INFO")
            and (search := INFO_RE.search(line))
            and (not select_columns or search["id"].lower() in select_columns)
        ):
            metadata[search["id"].lower()] = line.lstrip("#")

    return metadata

column_name

column_name(
    number_of_column: int = MINIMAL_COL_NUMBER,
) -> Iterator[str]

Get an iterator of the correct column names.

Returns: String iterator

Source code in src/variantplaner/objects/vcf_header.py
def column_name(self, number_of_column: int = MINIMAL_COL_NUMBER) -> typing.Iterator[str]:
    """Get an iterator of correct column name.

    Returns: String iterator
    """
    base_col_name = ["chr", "pos", "vid", "ref", "alt", "qual", "filter", "info"]

    yield from base_col_name

    if number_of_column > MINIMAL_COL_NUMBER and (samples := self.samples_index):
        yield "format"
        yield from (sample for (sample, _) in samples.items())

format_parser

format_parser(
    select_format: set[str] | None = None,
) -> dict[str, Callable[[Expr, str], Expr]]

Generate a mapping of pipeable functions, built on polars.Expr, to extract genotype information.

Warning: Float values can't be converted for the moment; they are stored as String to keep the information.

Args: select_format: Set of target FORMAT field ids.

Returns: A dict linking each format id to a pipeable function over polars.Expr

Raises: NotVcfHeaderError: If no header line starts with '#CHROM'

Source code in src/variantplaner/objects/vcf_header.py
def format_parser(
    self,
    select_format: set[str] | None = None,
) -> dict[str, typing.Callable[[polars.Expr, str], polars.Expr]]:
    """Generate a list of [polars.Expr](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/index.html) to extract genotypes information.

    **Warning**: Float values can't be converted for the moment they are stored as String to keep information

    Args:
    header: Line of vcf header.
    input_path: Path to vcf file.
    select_format: List of target format field.

    Returns:
    A dict to link format id to pipeable function with Polars.Expr

    Raises:
    NotVcfHeaderError: If all line not start by '#CHR'
    """
    expressions: dict[str, typing.Callable[[polars.Expr, str], polars.Expr]] = {}

    for line in self._header:
        if line.startswith("#CHROM"):
            return expressions

        if not line.startswith("##FORMAT"):
            continue

        if (search := FORMAT_RE.search(line)) and (not select_format or search["id"] in select_format):
            name = search["id"]
            number = search["number"]
            format_type = search["type"]

            if name == "GT":
                expressions["GT"] = VcfHeader.__format_gt
                continue

            if number == "1":
                if format_type == "Integer":
                    expressions[name] = VcfHeader.__format_one_int
                elif format_type == "Float":  # noqa: SIM114 Float isn't already support but in future
                    expressions[name] = VcfHeader.__format_one_str
                elif format_type in {"String", "Character"}:
                    expressions[name] = VcfHeader.__format_one_str
                else:
                    pass  # Not reachable

            elif format_type == "Integer":
                expressions[name] = VcfHeader.__format_list_int
            elif format_type == "Float":  # noqa: SIM114 Float isn't already support but in future
                expressions[name] = VcfHeader.__format_list_str
            elif format_type in {"String", "Character"}:
                expressions[name] = VcfHeader.__format_list_str
            else:
                pass  # Not reachable

    raise NotVcfHeaderError

from_files

from_files(path: Path) -> None

Populate the VcfHeader object with the content of a header-only file.

Args: path: Path of file

Returns: None

Source code in src/variantplaner/objects/vcf_header.py
def from_files(self, path: pathlib.Path) -> None:
    """Populate VcfHeader object with content of only header file.

    Args:
    path: Path of file

    Returns:
    None
    """
    with open(path) as fh:
        for full_line in fh:
            line = full_line.strip()
            self._header.append(line)

from_lines

from_lines(lines: Iterator[str]) -> None

Extract all header information from vcf lines.

Header lines are those between the start of the file and the first line that starts with '#CHROM' or does not start with '#'.

Args: lines: Iterator of lines

Returns: None

Raises: NotVcfHeaderError: If a line does not start with '#'. NotVcfHeaderError: If no line starts with '#CHROM'

Source code in src/variantplaner/objects/vcf_header.py
def from_lines(self, lines: typing.Iterator[str]) -> None:
    """Extract all header information of vcf lines.

    Line between start of file and first line start with '#CHROM' or not start with '#'

    Args:
    lines: Iterator of line

    Returns: None

    Raises:
    NotAVcfHeader: If a line not starts with '#'
    NotAVcfHeader: If no line start by '#CHROM'
    """
    for full_line in lines:
        line = full_line.strip()

        if not line.startswith("#"):
            raise NotVcfHeaderError

        if line.startswith("#CHROM"):
            self._header.append(line)
            return

        self._header.append(line)

    raise NotVcfHeaderError

info_parser

info_parser(
    select_info: set[str] | None = None,
) -> list[Expr]

Generate a list of polars.Expr to extract variant information.

Args: select_info: Set of target INFO field ids.

Returns: List of polars.Expr to parse the info column.

Raises: NotVcfHeaderError: If no header line starts with '#CHROM'

Source code in src/variantplaner/objects/vcf_header.py
def info_parser(self, select_info: set[str] | None = None) -> list[polars.Expr]:
    """Generate a list of [polars.Expr](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/index.html) to extract variants information.

    Args:
    header: Line of vcf header
    input_path: Path to vcf file.
    select_info: List of target info field

    Returns:
    List of [polars.Expr](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/index.html) to parse info columns.

    Raises:
    NotVcfHeaderError: If all line not start by '#CHR'
    """
    expressions: list[polars.Expr] = []

    for line in self._header:
        if line.startswith("#CHROM"):
            return expressions

        if not line.startswith("##INFO"):
            continue

        if (search := INFO_RE.search(line)) and (not select_info or search["id"] in select_info):
            regex = rf"{search['id']}=([^;]+);?"

            local_expr = polars.col("info").str.extract(regex, 1)

            if search["number"] == "1":
                if search["type"] == "Integer":
                    local_expr = local_expr.cast(polars.Int64)
                elif search["type"] == "Float":
                    local_expr = local_expr.cast(polars.Float64)
                elif search["type"] in {"String", "Character"}:
                    pass  # Not do anything on string or character
                else:
                    pass  # Not reachable

            else:
                local_expr = local_expr.str.split(",")
                if search["type"] == "Integer":
                    local_expr = local_expr.cast(polars.List(polars.Int64))
                elif search["type"] == "Float":
                    local_expr = local_expr.cast(polars.List(polars.Float64))
                elif search["type"] in {"String", "Character"}:
                    pass  # Not do anything on string or character
                else:
                    pass  # Not reachable

            expressions.append(local_expr.alias(search["id"]))

    raise NotVcfHeaderError
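
A short sketch of info_parser applied to a Vcf parsed in the earlier examples; the INFO ids below (AF, DP) are hypothetical and must exist in the header:

info_exprs = vcf.header.info_parser({"AF", "DP"})
annotated = vcf.lf.with_columns(info_exprs)
print(annotated.collect_schema().names())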

VcfParsingBehavior

Bases: IntFlag

Enumeration used to control the behavior of IntoLazyFrame.

Attributes:

  • KEEP_STAR

    Keep star variant.

  • MANAGE_SV

    into_lazyframe tries to avoid structural variant id collisions; SVTYPE/SVLEN info values must be present.

  • NOTHING

    into_lazyframe does not have any specific behavior.

KEEP_STAR class-attribute instance-attribute

KEEP_STAR = auto()

Keep star variant.

MANAGE_SV class-attribute instance-attribute

MANAGE_SV = auto()

into_lazyframe tries to avoid structural variant id collisions; SVTYPE/SVLEN info values must be present.

NOTHING class-attribute instance-attribute

NOTHING = auto()

into_lazyframe does not have any specific behavior.

any2string

any2string(value: Any) -> str

Convert any value into a string. Used for temp file creation.

Source code in src/variantplaner/__init__.py
def any2string(value: typing.Any) -> str:
    """Convert an int in a string. Use for temp file creation."""
    return str(uuid.uuid5(uuid.NAMESPACE_URL, str(value)))

exception

Exceptions that can be raised by VariantPlaner.

Classes:

  • NoContigsLengthInformationError

    Exception raised if contigs length information is not found in the vcf or in a companion file.

  • NoGTError

    Exception raised if the genotype polars.LazyFrame does not contain a gt column.

  • NoGenotypeError

    Exception raised if the vcf file does not seem to contain genotype information.

  • NotAVCFError

    Exception raised if the file read does not seem to be a vcf, generally because it does not contain a line starting with '#CHROM'.

  • NotAVariantCsvError

    Exception raised if a csv file that should contain variant information has column names that do not match the minimal requirements.

  • NotVcfHeaderError

    Exception raised if the header isn't compatible with vcf.

NoContigsLengthInformationError

NoContigsLengthInformationError()

Bases: Exception

Exception raised if contigs length information is not found in the vcf or in a companion file.

Source code in src/variantplaner/exception.py
def __init__(self):
    """Initize no contigs length information error."""
    super().__init__("Contigs length information is required in vcf header of in compagnion file.")

NoGTError

NoGTError(message: str)

Bases: Exception

Exception raised if the genotype polars.LazyFrame does not contain a gt column.

Source code in src/variantplaner/exception.py
def __init__(self, message: str):
    """Initialize no gt error."""
    super().__init__(f"In {message} gt column is missing.")

NoGenotypeError

NoGenotypeError()

Bases: Exception

Exception raised if the vcf file does not seem to contain genotype information.

Source code in src/variantplaner/exception.py
def __init__(self):
    """Initialize no genotype error."""
    super().__init__("LazyFrame seems not contains genotypes.")

NotAVCFError

NotAVCFError(path: Path)

Bases: Exception

Exception raised if the file read does not seem to be a vcf, generally because it does not contain a line starting with '#CHROM'.

Source code in src/variantplaner/exception.py
def __init__(self, path: pathlib.Path):
    """Initialize not a vcf error."""
    super().__init__(f"File {path} seems not be a valid vcf file.")

NotAVariantCsvError

NotAVariantCsvError(path: Path)

Bases: Exception

Exception raised if a csv file that should contain variant information has column names that do not match the minimal requirements.

Source code in src/variantplaner/exception.py
def __init__(self, path: pathlib.Path):
    """Initialize not a variant csv error."""
    super().__init__(f"{path} seems not be a csv variant.")

NotVcfHeaderError

NotVcfHeaderError()

Bases: Exception

Exception raised if the header isn't compatible with vcf.

Source code in src/variantplaner/exception.py
def __init__(self):
    """Initialize not a vcf header error."""
    super().__init__("Not a vcf header")

generate

Functions to generate information.

Functions:

transmission

transmission(
    genotypes_lf: LazyFrame,
    index_names: tuple[str],
    mother_names: tuple[str | None] = (None,),
    father_names: tuple[str | None] = (None,),
) -> DataFrame | None

Compute how each variant is transmitted to the index case.

Parameters:

  • genotypes_lf (LazyFrame) –

    Genotypes polars.LazyFrame; the gt column is required.

  • index_names

    Sample names of the index cases.

  • mother_names

    Sample names of the mothers.

  • father_names

    Sample names of the fathers.

Returns:

  • DataFrame | None

    polars.DataFrame with transmission information, including genotyping information for the index, mother and father. If any of them isn't present, values are set to polars.Null. The transmission column contains a string: concat(chr(index_gt + 33), chr(mother_gt + 33), chr(father_gt + 33)); for example, transmission #~! means a homozygous diploid variant not present in the father, with no information about the mother.

Raises:

  • NoGTError

    If genotypes_lf does not contain a gt column.

Source code in src/variantplaner/generate.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
def transmission(
    genotypes_lf: polars.LazyFrame,
    index_names: tuple[str],
    mother_names: tuple[str | None] = (None,),
    father_names: tuple[str | None] = (None,),
) -> polars.DataFrame | None:
    """Compute how each variant are transmite to index case.

    Args:
        genotypes_lf: Genotypes [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html), `gt` column are required.
        index_name: Sample name of index case.
        mother_name: Sample name of mother.
        father_name: Sample name of father.

    Returns:
         [polars.DataFrame](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/index.html) with transmission information. With genotyping information for index, mother and father. If any of them isn't present value are set to polars.Null. Columns transmission contains a string: concat(chr(index_gt + 33), chr(mother_gt + 33), chr(father_gt + 33)), transmission: `#~!` mean homozygote diploide variant not present in father but with no information about mother.

    Raises:
        NoGTError: if genotypes_lf not containts gt column.
    """
    genotypes_column = list(genotypes_lf.collect_schema().names()[2:])
    if "gt" not in genotypes_column:
        raise NoGTError("genotype polars.LazyFrame")

    genotypes_df = genotypes_lf.collect()

    dfs = []
    for names in itertools.zip_longest(index_names, mother_names, father_names, fillvalue=None):
        df = __trio(genotypes_df, genotypes_column, *names)
        if df is not None:
            dfs.append(df)

    if len(dfs) > 0:
        return polars.concat(dfs)
    return None
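
A toy sketch of calling transmission, assuming a minimal genotypes lazyframe whose only genotyping column is gt (real data would usually carry more columns):

import polars

from variantplaner import generate

genotypes_lf = polars.LazyFrame(
    {
        "id": [1, 1, 1],
        "sample": ["child", "mother", "father"],
        "gt": [2, 1, 0],
    },
    schema_overrides={"id": polars.UInt64, "gt": polars.UInt8},
)

transmission_df = generate.transmission(genotypes_lf, ("child",), ("mother",), ("father",))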

transmission_ped

transmission_ped(
    genotypes_lf: LazyFrame, pedigree_lf: LazyFrame
) -> DataFrame | None

Compute transmission of each variant.

Warning: only the first sample with two parents is considered.

Parameters:

Returns:

  • DataFrame | None

    DataFrame with transmission information

Raises:

  • NoGTError

    If genotypes_lf does not contain a gt column.

Source code in src/variantplaner/generate.py
def transmission_ped(
    genotypes_lf: polars.LazyFrame,
    pedigree_lf: polars.LazyFrame,
) -> polars.DataFrame | None:
    """Compute transmission of each variants.

    **Warning**: only the first sample with two parent are considered.

    Args:
        genotypes_lf: Genotypes [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html), `gt` column are required.
        pedigree_lf: Pedigree [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html).

    Returns:
         DataFrame with transmission information

    Raises:
        NoGTError: If genotypes_lf not contains gt column.
    """
    pedigree_df = pedigree_lf.collect()

    if pedigree_df.height > 1:
        pedigree_df = pedigree_df.filter(polars.col("father_id").is_not_null() | polars.col("mother_id").is_not_null())

    return transmission(
        genotypes_lf,
        tuple(pedigree_df.get_column("personal_id").to_list()),
        tuple(pedigree_df.get_column("mother_id").to_list()),
        tuple(pedigree_df.get_column("father_id").to_list()),
    )

normalization

Functions used to normalize data.

Functions:

add_id_part

add_id_part(
    lf: LazyFrame, number_of_bits: int = 8
) -> LazyFrame

Add an id_part column.

If id is a large variant id value, id_part is set to 255; for other values, the 8 most significant bits are used.

Parameters:

Returns:

Source code in src/variantplaner/normalization.py
def add_id_part(lf: polars.LazyFrame, number_of_bits: int = 8) -> polars.LazyFrame:
    """Add column id part.

    If id is large variant id value, id_part are set to 255, other value most weigthed position 8 bits are use.

    Args:
        lf: [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) contains: id column.

    Returns:
        [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) with column id_part added
    """
    return lf.with_columns(id_part=variant_id.compute_part("id", number_of_bits=number_of_bits))

add_variant_id

add_variant_id(
    lf: LazyFrame, chrom2length: LazyFrame
) -> LazyFrame

Add an id column to variants.

Id computation works as follows.

Two different algorithms are used to calculate the variant identifier, depending on the cumulative length of the reference and alternative sequences.

If the cumulative length of the reference and alternative sequences is short, the leftmost bit of the id is set to 0, then a unique 63-bit hash of the variant is calculated.

If the cumulative length of the reference and alternative sequences is long, the right-most bit of the id has a value of 1, followed by a hash function (the one used in Firefox) of the chromosome, position, reference and alternative sequence, without the right-most bit.

If lf.columns contains SVTYPE and SVLEN, the alt of variants whose regex group in alt <([^:]+).*> matches SVTYPE is replaced by the concatenation of SVTYPE and the first SVLEN value.

Parameters:

Returns:

Source code in src/variantplaner/normalization.py
def add_variant_id(lf: polars.LazyFrame, chrom2length: polars.LazyFrame) -> polars.LazyFrame:
    """Add a column id of variants.

    Id computation is based on

    Two different algorithms are used to calculate the variant identifier, depending on the cumulative length of the reference and alternative sequences.

    If the cumulative length of the reference and alternative sequences is short, the leftmost bit of the id is set to 0, then a unique 63-bit hash of the variant is calculated.

    If the cumulative length of the reference and alternative sequences is long, the right-most bit of the id will have a value of 1, followed by a hash function, used in Firefox, of the chromosome, position, reference and alternative sequence without the right-most bit.

    If lf.columns contains SVTYPE and SVLEN variant with regex group in alt <([^:]+).*> match SVTYPE are replaced by concatenation of SVTYPE and SVLEN first value.

    Args:
        lf: [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) contains: chr, pos, ref, alt columns.
        chrom2length: [polars.DataFrame](https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/index.html) contains: chr and length columns.

    Returns:
        [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) with chr column normalized
    """
    real_pos_max = chrom2length.select([polars.col("length").sum()]).collect().get_column("length").max()

    large_variant_len = (64 - len(format(real_pos_max, "b")) - 2) // 2 + 1

    col_names = lf.collect_schema().names()
    if "SVTYPE" in col_names and "SVLEN" in col_names:
        lf = lf.with_columns(
            alt=polars.when(
                polars.col("alt").str.replace("<(?<type>[^:]+).*>", "$type") == polars.col("SVTYPE"),
            )
            .then(
                polars.col("alt")
                .str.replace(
                    ".+",
                    polars.concat_str(
                        [polars.col("SVTYPE"), polars.col("SVLEN").list.get(0)],
                        separator="-",
                    ),
                )
                .str.pad_end(large_variant_len, "-"),
            )
            .otherwise(
                polars.col("alt"),
            ),
        )

    lf = lf.with_columns(alt=polars.col("alt").str.replace("\\*", "*" * large_variant_len))
    lf = lf.join(chrom2length, right_on="contig", left_on="chr", how="left", coalesce=True)
    lf = lf.with_columns(real_pos=polars.col("pos") + polars.col("offset"))

    lf = lf.with_columns(
        id=variant_id.compute_id(
            "real_pos",
            "ref",
            "alt",
            real_pos_max,
        ),
    )

    return lf.drop(["real_pos", "length", "offset"])
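
A sketch of add_variant_id on hand-built lazyframes; the contig table below already carries the offset column that ContigsLength would normally compute, and all values are made up:

import polars

from variantplaner import normalization

chrom2length = polars.LazyFrame(
    {"contig": ["1", "2"], "length": [248956422, 242193529], "offset": [0, 248956422]},
    schema_overrides={"length": polars.UInt64, "offset": polars.UInt64},
)

variants = polars.LazyFrame(
    {"chr": ["1"], "pos": [10146], "ref": ["AC"], "alt": ["A"]},
    schema_overrides={"pos": polars.UInt64},
)

with_id = normalization.add_variant_id(variants, chrom2length)
print(with_id.collect())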

io

Module managing input parsing and output serialization.

Modules:

  • vcf

    Read and write vcf file.

vcf

Read and write vcf file.

Functions:

build_rename_column

build_rename_column(
    chromosome: str,
    pos: str,
    identifier: str,
    ref: str,
    alt: str,
    qual: str | None = ".",
    filter_col: str | None = ".",
    info: list[tuple[str, str]] | None = None,
    format_string: str | None = None,
    sample: dict[str, dict[str, str]] | None = None,
) -> RenameCol

A helper function to generate the rename column dict used as the renaming parameter of the variantplaner.io.vcf.lazyframe_in_vcf function.

Returns:

  • RenameCol

    A rename column dictionary.

Source code in src/variantplaner/io/vcf.py
def build_rename_column(
    chromosome: str,
    pos: str,
    identifier: str,
    ref: str,
    alt: str,
    qual: str | None = ".",
    filter_col: str | None = ".",
    info: list[tuple[str, str]] | None = None,
    format_string: str | None = None,
    sample: dict[str, dict[str, str]] | None = None,
) -> RenameCol:
    """A helper function to generate rename column dict for [variantplaner.io.vcf.lazyframe_in_vcf][] function parameter.

    Returns:
        A rename column dictionary.
    """
    return {
        "#CHROM": chromosome,
        "POS": pos,
        "ID": identifier,
        "REF": ref,
        "ALT": alt,
        "QUAL": "." if qual is None else qual,
        "FILTER": "." if filter_col is None else filter_col,
        "INFO": [] if info is None else info,
        "FORMAT": "" if format_string is None else format_string,
        "sample": {} if sample is None else sample,
    }
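
A sketch of build_rename_column for a lazyframe whose columns do not follow the default names (the source column names are made up):

from variantplaner.io import vcf as io_vcf

renaming = io_vcf.build_rename_column(
    "chromosome",    # column holding the chromosome
    "position",      # column holding the position
    "variant_name",  # column holding the identifier
    "reference",
    "alternative",
)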

lazyframe_in_vcf

lazyframe_in_vcf(
    lf: LazyFrame,
    output_path: Path,
    /,
    vcf_header: VcfHeader | None = None,
    renaming: RenameCol = DEFAULT_RENAME,
) -> None

Write polars.LazyFrame in vcf format.

Warning: This function performs polars.LazyFrame.collect before writing the vcf, which can have a significant impact on memory usage.

Parameters:

  • lf (LazyFrame) –

    LazyFrame containing the information to write.

  • output_path (Path) –

    Path where the vcf is written.

Returns:

  • None

    None

Source code in src/variantplaner/io/vcf.py
def lazyframe_in_vcf(
    lf: polars.LazyFrame,
    output_path: pathlib.Path,
    /,
    vcf_header: VcfHeader | None = None,
    renaming: RenameCol = DEFAULT_RENAME,
) -> None:
    """Write [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) in vcf format.

    Warning: This function performs [polars.LazyFrame.collect][] before write vcf, this can have a significant impact on memory usage.

    Args:
        lf: LazyFrame contains information.
        output_path: Path to where vcf to write.

    Returns:
        None
    """
    select_column: list[str] = []

    lf = lf.with_columns(
        [
            polars.col(renaming["#CHROM"]).alias("#CHROM"),
            polars.col(renaming["POS"]).alias("POS"),
            polars.col(renaming["ID"]).alias("ID"),
            polars.col(renaming["REF"]).alias("REF"),
            polars.col(renaming["ALT"]).alias("ALT"),
        ],
    )

    select_column.extend(["#CHROM", "POS", "ID", "REF", "ALT"])

    if vcf_header is None:
        header = __generate_header(lf, renaming["INFO"], list(renaming["sample"].keys()), renaming["FORMAT"])
    else:
        header = "\n".join(vcf_header._header)

    if renaming["QUAL"] != ".":
        lf = lf.with_columns([polars.col(renaming["QUAL"]).alias("QUAL")])
    else:
        lf = lf.with_columns([polars.lit(".").alias("QUAL")])

    select_column.append("QUAL")

    if renaming["FILTER"] != ".":
        lf = lf.with_columns([polars.col(renaming["FILTER"]).alias("FILTER")])
    else:
        lf = lf.with_columns([polars.lit(".").alias("FILTER")])

    select_column.append("FILTER")

    lf = (
        __rebuild_info_column(lf, renaming["INFO"])
        if renaming["INFO"]
        else lf.with_columns(polars.lit(".").alias("INFO"))
    )

    select_column.append("INFO")

    if renaming["FORMAT"]:
        lf = lf.with_columns(polars.lit(renaming["FORMAT"]).alias("FORMAT"))
        select_column.append("FORMAT")

    if renaming["FORMAT"] and renaming["sample"]:
        schema = lf.collect_schema()
        for sample_name in renaming["sample"]:
            lf = lf.with_columns(
                [
                    __lazy2format(
                        sample_name,
                        renaming["FORMAT"],
                        dict(zip(schema.names(), schema.dtypes())),
                    ).alias(sample_name),
                ],
            )
            select_column.append(sample_name)

    lf = lf.select([polars.col(col) for col in select_column])

    with open(output_path, "wb") as fh:
        fh.write(header.encode())
        fh.write(lf.collect().write_csv(separator="\t").encode())
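
A minimal write sketch reusing the renaming helper above on a tiny hand-built lazyframe (all values are made up):

import pathlib

import polars

from variantplaner.io import vcf as io_vcf

lf = polars.LazyFrame(
    {"chr": ["1"], "pos": [10146], "vid": ["."], "ref": ["AC"], "alt": ["A"]},
)

renaming = io_vcf.build_rename_column("chr", "pos", "vid", "ref", "alt")
io_vcf.lazyframe_in_vcf(lf, pathlib.Path("out.vcf"), renaming=renaming)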

struct

Generated data structures for easy integration.

Modules:

  • genotypes

    Functions related to genotype structuring.

  • variants

    Functions related to vcf structuring.

genotypes

Functions related to genotype structuring.

Functions:

  • hive

    Read all genotype parquet files and use the information to generate a hive-like structure, based on bits 63 to 55 (inclusive) of the variant id, with genotype information.

hive

hive(
    paths: list[Path],
    output_prefix: Path,
    threads: int,
    file_per_thread: int,
    *,
    append: bool,
    number_of_bits: int = 8,
) -> None

Read all genotype parquet files and use the information to generate a hive-like structure, based on bits 63 to 55 (inclusive) of the variant id, with genotype information.

The real number of threads used is equal to \(min(threads, len(paths))\).

The output layout looks like: {output_prefix}/id_part=[0..2.pow(number_of_bits)]/0.parquet.

Parameters:

  • paths (list[Path]) –

    list of files you want to reorganize

  • output_prefix (Path) –

    prefix of the hive

  • threads (int) –

    number of multiprocessing threads to run

  • file_per_thread (int) –

    number of files managed per multiprocessing thread

Returns:

  • None

    None

Source code in src/variantplaner/struct/genotypes.py
def hive(
    paths: list[pathlib.Path],
    output_prefix: pathlib.Path,
    threads: int,
    file_per_thread: int,
    *,
    append: bool,
    number_of_bits: int = 8,
) -> None:
    r"""Read all genotypes parquet file and use information to generate a hive like struct, based on 63rd and 55th bits included of variant id with genotype information.

    Real number of threads use are equal to $min(threads, len(paths))$.

    Output format look like: `{output_prefix}/id_part=[0..2.pow(number_of_bits)]/0.parquet`.

    Args:
        paths: list of file you want reorganize
        output_prefix: prefix of hive
        threads: number of multiprocessing threads run
        file_per_thread: number of file manage per multiprocessing threads

    Returns:
        None
    """
    logger.info(f"{paths=} {output_prefix=}, {threads=}, {file_per_thread=}, {append=} {number_of_bits=}")

    if len(paths) == 0:
        return

    for i in range(pow(2, number_of_bits)):
        (output_prefix / f"id_part={i}").mkdir(parents=True, exist_ok=True)

    path_groups: typing.Iterable[typing.Iterable[pathlib.Path]] = list(
        [[path] for path in paths]
        if file_per_thread < 2  # noqa: PLR2004 if number of file is lower than 2 file grouping isn't required
        else itertools.zip_longest(
            *[iter(paths)] * file_per_thread,
        ),
    )

    basenames = [
        variantplaner.any2string(hash("_".join(p.stem for p in g_paths if p is not None))) for g_paths in path_groups
    ]

    column_order = None
    lf_groups = []
    for g_paths in path_groups:
        group = []
        for p in g_paths:
            if p is None:
                continue

            lf = polars.scan_parquet(p, hive_partitioning=False)
            if column_order is None:
                column_order = lf.collect_schema().names()
            group.append(lf.select(column_order))

        lf_groups.append(group)

    logger.info(f"{path_groups=}, {basenames=}")

    with multiprocessing.get_context("spawn").Pool(threads) as pool:
        pool.starmap(
            __hive_worker,
            [(lf_group, basename, output_prefix, number_of_bits) for lf_group, basename in zip(lf_groups, basenames)],
        )

        pool.starmap(
            __merge_file,
            [(output_prefix / f"id_part={id_part}", basenames, append) for id_part in range(pow(2, number_of_bits))],
        )
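
A usage sketch of hive on genotype parquet files produced earlier (the paths are hypothetical):

import pathlib

from variantplaner.struct import genotypes

paths = [pathlib.Path("genotypes/sample1.parquet"), pathlib.Path("genotypes/sample2.parquet")]

genotypes.hive(
    paths,
    pathlib.Path("hive/genotypes"),
    threads=2,
    file_per_thread=1,
    append=False,
)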

variants

Functions related to vcf structuring.

Functions:

  • merge

    Merge multiple parquet variant files into one file.

merge

merge(
    paths: list[Path],
    output_prefix: Path,
    memory_limit: int = 10000000000,
    polars_threads: int = 4,
    *,
    append: bool,
) -> None

Merge multiple parquet variant files into one file.

This function generates temporary files; by default they are written in /tmp, but you can control where they are written by setting the TMPDIR, TEMP or TMP environment variables.

Parameters:

  • paths (list[Path]) –

    List of files you want chunked.

  • output_prefix

    Path where the merged variants are written.

  • memory_limit (int, default: 10000000000 ) –

    Size of each chunk in bytes.

Returns:

  • None

    None

Source code in src/variantplaner/struct/variants.py
def merge(
    paths: list[pathlib.Path],
    output_prefix: pathlib.Path,
    memory_limit: int = 10_000_000_000,
    polars_threads: int = 4,
    *,
    append: bool,
) -> None:
    """Perform merge of multiple parquet variants file in one file.

    These function generate temporary file, by default file are written in `/tmp` but you can control where these files are written by set TMPDIR, TEMP or TMP directory.

    Args:
        paths: List of file you want chunked.
        output: Path where variants is written.
        memory_limit: Size of each chunk in bytes.

    Returns:
        None
    """
    all_threads = int(os.environ["POLARS_MAX_THREADS"])
    multi_threads = max(all_threads // polars_threads, 1)
    os.environ["POLARS_MAX_THREADS"] = str(polars_threads)
    output_prefix.mkdir(parents=True, exist_ok=True)

    temp_prefix = pathlib.Path(tempfile.gettempdir()) / "variantplaner" / variantplaner.any2string(output_prefix)
    temp_prefix.mkdir(parents=True, exist_ok=True)

    # merge file -> split by chromosome perform unique
    logger.debug("Start split first file")
    base_inputs_outputs: list[tuple[list[pathlib.Path], pathlib.Path]] = []
    for input_chunk in __chunk_by_memory(paths, bytes_limit=memory_limit):
        local_out_prefix = temp_prefix / variantplaner.any2string(input_chunk)
        local_out_prefix.mkdir(parents=True, exist_ok=True)

        base_inputs_outputs.append((input_chunk, local_out_prefix))

    with multiprocessing.get_context("spawn").Pool(multi_threads) as pool:
        chr_names = set().union(*pool.starmap(__merge_split_unique, base_inputs_outputs))
    logger.debug("End split first first file")

    if append and output_prefix.exists():
        chr_names |= {entry.name.split(".")[0] for entry in os.scandir(output_prefix) if entry.is_file()}

    # iterate over chromosme
    logger.debug("Start merge by chromosome")
    for chr_name in chr_names:
        logger.debug(f"start chromosome: {chr_name}")

        chr_temp_prefix = temp_prefix / chr_name
        chr_temp_prefix.mkdir(parents=True, exist_ok=True)

        inputs = [
            path / f"{chr_name}.parquet"
            for (_, path) in base_inputs_outputs
            if (path / f"{chr_name}.parquet").is_file()
        ]

        if append and (output_prefix / f"{chr_name}.parquet").exists():
            inputs.append(output_prefix / f"{chr_name}.parquet")

        if not inputs:
            continue

        while len(inputs) > 1:
            new_inputs = []

            inputs_outputs = []
            for input_chunk in __chunk_by_memory(inputs, bytes_limit=memory_limit):
                logger.debug(f"{input_chunk}")
                if len(input_chunk) == 1:
                    new_inputs.append(input_chunk[0])
                elif len(input_chunk) > 1:
                    temp_output = chr_temp_prefix / variantplaner.any2string(input_chunk) / f"{chr_name}.parquet"
                    temp_output.parent.mkdir(parents=True, exist_ok=True)

                    new_inputs.append(temp_output)
                    inputs_outputs.append((input_chunk, temp_output))

            inputs = new_inputs

            with multiprocessing.get_context("spawn").Pool(multi_threads) as pool:
                pool.starmap(__merge_unique, inputs_outputs)

        shutil.move(inputs[0], output_prefix / f"{chr_name}.parquet")
        logger.debug(f"end chromosome: {chr_name}")
    logger.debug("End merge by chromosome")

    # Call cleanup to remove all tempfile generate durring merging
    logger.debug("Star clean tmp file")
    shutil.rmtree(temp_prefix, ignore_errors=True)
    logger.debug("End clean tmp file")