from __future__ import annotations

import platform
import sys
from typing import TYPE_CHECKING
from typing import Any
from typing import Iterable
from typing import Literal
from typing import Protocol
from typing import TypeVar
from typing import Union

from narwhals.dataframe import DataFrame
from narwhals.dataframe import LazyFrame
from narwhals.translate import from_native
from narwhals.utils import Implementation
from narwhals.utils import parse_version
from narwhals.utils import validate_laziness

# Missing type parameters for generic type "DataFrame"
# However, trying to provide one results in mypy still complaining...
# The rest of the annotations seem to work fine with this anyway
FrameT = TypeVar("FrameT", bound=Union[DataFrame, LazyFrame])  # type: ignore[type-arg]

if TYPE_CHECKING:
    from types import ModuleType

    from narwhals.dtypes import DType
    from narwhals.schema import Schema
    from narwhals.series import Series
    from narwhals.typing import DTypes

    class ArrowStreamExportable(Protocol):
        def __arrow_c_stream__(
            self, requested_schema: object | None = None
        ) -> object: ...
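

# Illustrative sketch (not part of the narwhals API): any object that exposes
# `__arrow_c_stream__` -- for example a `pyarrow.Table` on PyArrow >= 14 --
# satisfies the `ArrowStreamExportable` protocol above, which is all that
# `from_arrow` below relies on. The helper name and the local import are
# assumptions made for this demo only.
def _demo_arrow_stream_exportable() -> bool:
    import pyarrow as pa

    tbl = pa.table({"a": [1, 2, 3]})
    # Structural check: the table exports the Arrow C stream PyCapsule.
    return hasattr(tbl, "__arrow_c_stream__")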


def concat(
    items: Iterable[FrameT],
    *,
    how: Literal["horizontal", "vertical"] = "vertical",
) -> FrameT:
    """
    Concatenate multiple DataFrames or LazyFrames into a single frame.

    Arguments:
        items: DataFrames or LazyFrames to concatenate.

        how: {'vertical', 'horizontal'}
            * vertical: Stacks the frames on top of each other, appending rows;
              the frames are expected to have matching column names.
            * horizontal: Places the frames side by side, appending columns; if
              the lengths don't match, the shorter frames are filled with `null`.

    Returns:
        A new DataFrame or LazyFrame resulting from the concatenation.

    Raises:
        NotImplementedError: If the items to concatenate are a mix of eager and
            lazy frames; they should either all be eager or all be lazy.

    Examples:
        Let's take an example of vertical concatenation:

        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> data_1 = {"a": [1, 2, 3], "b": [4, 5, 6]}
        >>> data_2 = {"a": [5, 2], "b": [1, 4]}
        >>> df_pd_1 = pd.DataFrame(data_1)
        >>> df_pd_2 = pd.DataFrame(data_2)
        >>> df_pl_1 = pl.DataFrame(data_1)
        >>> df_pl_2 = pl.DataFrame(data_2)

        Let's define a dataframe-agnostic function:

        >>> @nw.narwhalify
        ... def func(df1, df2):
        ...     return nw.concat([df1, df2], how="vertical")

        >>> func(df_pd_1, df_pd_2)
           a  b
        0  1  4
        1  2  5
        2  3  6
        0  5  1
        1  2  4

        >>> func(df_pl_1, df_pl_2)
        shape: (5, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 4   │
        │ 2   ┆ 5   │
        │ 3   ┆ 6   │
        │ 5   ┆ 1   │
        │ 2   ┆ 4   │
        └─────┴─────┘

        Let's look at an example of horizontal concatenation:

        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> data_1 = {"a": [1, 2, 3], "b": [4, 5, 6]}
        >>> data_2 = {"c": [5, 2], "d": [1, 4]}
        >>> df_pd_1 = pd.DataFrame(data_1)
        >>> df_pd_2 = pd.DataFrame(data_2)
        >>> df_pl_1 = pl.DataFrame(data_1)
        >>> df_pl_2 = pl.DataFrame(data_2)

        Defining a dataframe-agnostic function:

        >>> @nw.narwhalify
        ... def func(df1, df2):
        ...     return nw.concat([df1, df2], how="horizontal")

        >>> func(df_pd_1, df_pd_2)
           a  b    c    d
        0  1  4  5.0  1.0
        1  2  5  2.0  4.0
        2  3  6  NaN  NaN

        >>> func(df_pl_1, df_pl_2)
        shape: (3, 4)
        ┌─────┬─────┬──────┬──────┐
        │ a   ┆ b   ┆ c    ┆ d    │
        │ --- ┆ --- ┆ ---  ┆ ---  │
        │ i64 ┆ i64 ┆ i64  ┆ i64  │
        ╞═════╪═════╪══════╪══════╡
        │ 1   ┆ 4   ┆ 5    ┆ 1    │
        │ 2   ┆ 5   ┆ 2    ┆ 4    │
        │ 3   ┆ 6   ┆ null ┆ null │
        └─────┴─────┴──────┴──────┘
    """
    if how not in ("horizontal", "vertical"):  # pragma: no cover
        msg = "Only horizontal and vertical concatenations are supported"
        raise NotImplementedError(msg)
    if not items:
        msg = "No items to concatenate"
        raise ValueError(msg)
    items = list(items)
    validate_laziness(items)
    first_item = items[0]
    plx = first_item.__narwhals_namespace__()
    return first_item._from_compliant_dataframe(  # type: ignore[return-value]
        plx.concat([df._compliant_frame for df in items], how=how),
    )
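

# Illustrative sketch (not part of the narwhals API): mixing eager and lazy
# inputs is rejected, as documented in the "Raises" section of `concat` above.
# Assumes polars is installed; the helper name is hypothetical and exists only
# to demonstrate the error path.
def _demo_concat_mixed_laziness() -> None:
    import polars as pl

    df = from_native(pl.DataFrame({"a": [1, 2]}), eager_only=True)
    lf = from_native(pl.LazyFrame({"a": [3, 4]}))
    try:
        concat([df, lf], how="vertical")
    except NotImplementedError as exc:
        # Expected: eager and lazy frames cannot be concatenated together.
        print(f"concat rejected mixed laziness: {exc}")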


def new_series(
    name: str,
    values: Any,
    dtype: DType | type[DType] | None = None,
    *,
    native_namespace: ModuleType,
) -> Series:
    """
    Instantiate a Narwhals Series from raw data.

    Arguments:
        name: Name of resulting Series.

        values: Values to make the Series from.

        dtype: (Narwhals) dtype. If not provided, the native library
            may auto-infer it from `values`.

        native_namespace: The native library to use for Series creation.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]}

        Let's define a dataframe-agnostic function:

        >>> @nw.narwhalify
        ... def func(df):
        ...     values = [4, 1, 2]
        ...     native_namespace = nw.get_native_namespace(df)
        ...     return nw.new_series("c", values, nw.Int32, native_namespace=native_namespace)

        Let's see what happens when passing pandas / Polars input:

        >>> func(pd.DataFrame(data))
        0    4
        1    1
        2    2
        Name: c, dtype: int32
        >>> func(pl.DataFrame(data))  # doctest: +NORMALIZE_WHITESPACE
        shape: (3,)
        Series: 'c' [i32]
        [
           4
           1
           2
        ]
    """
    from narwhals import dtypes

    return _new_series_impl(
        name,
        values,
        dtype,
        native_namespace=native_namespace,
        dtypes=dtypes,  # type: ignore[arg-type]
    )


def _new_series_impl(
    name: str,
    values: Any,
    dtype: DType | type[DType] | None = None,
    *,
    native_namespace: ModuleType,
    dtypes: DTypes,
) -> Series:
    implementation = Implementation.from_native_namespace(native_namespace)
    if implementation is Implementation.POLARS:
        if dtype:
            from narwhals._polars.utils import (
                narwhals_to_native_dtype as polars_narwhals_to_native_dtype,
            )

            dtype = polars_narwhals_to_native_dtype(dtype, dtypes=dtypes)
        native_series = native_namespace.Series(name=name, values=values, dtype=dtype)
    elif implementation in {
        Implementation.PANDAS,
        Implementation.MODIN,
        Implementation.CUDF,
    }:
        if dtype:
            from narwhals._pandas_like.utils import (
                narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype,
            )

            backend_version = parse_version(native_namespace.__version__)
            dtype = pandas_like_narwhals_to_native_dtype(
                dtype, None, implementation, backend_version, dtypes
            )
        native_series = native_namespace.Series(values, name=name, dtype=dtype)
    elif implementation is Implementation.PYARROW:
        if dtype:
            from narwhals._arrow.utils import (
                narwhals_to_native_dtype as arrow_narwhals_to_native_dtype,
            )

            dtype = arrow_narwhals_to_native_dtype(dtype, dtypes=dtypes)
        native_series = native_namespace.chunked_array([values], type=dtype)
    elif implementation is Implementation.DASK:
        msg = "Dask support in Narwhals is lazy-only, so `new_series` is not supported"
        raise NotImplementedError(msg)
    else:  # pragma: no cover
        try:
            # implementation is UNKNOWN, Narwhals extension using this feature should
            # implement a `new_series` constructor in the top-level namespace.
            native_series = native_namespace.new_series(name, values, dtype)
        except AttributeError as e:
            msg = "Unknown namespace is expected to implement `Series` constructor."
            raise AttributeError(msg) from e
    return from_native(native_series, series_only=True).alias(name)
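

# Illustrative sketch (not part of the narwhals API): the PyArrow branch above
# wraps `values` in `pyarrow.chunked_array`, so the result comes back as a
# Narwhals Series backed by a ChunkedArray. Assumes pyarrow is installed; the
# helper name is hypothetical.
def _demo_new_series_pyarrow() -> None:
    import pyarrow as pa

    s = new_series("c", [4, 1, 2], native_namespace=pa)
    # The native object should be a pyarrow ChunkedArray named "c".
    print(type(s.to_native()), s.name)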


def from_dict(
    data: dict[str, Any],
    schema: dict[str, DType] | Schema | None = None,
    *,
    native_namespace: ModuleType | None = None,
) -> DataFrame[Any]:
    """
    Instantiate DataFrame from dictionary.

    Notes:
        For pandas-like dataframes, conversion to schema is applied after dataframe
        creation.

    Arguments:
        data: Dictionary to create DataFrame from.

        schema: The DataFrame schema as Schema or dict of {name: type}.

        native_namespace: The native library to use for DataFrame creation. Only
            necessary if inputs are not Narwhals Series.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]}

        Let's create a new dataframe of the same class as the dataframe we started with, from a dict of new data:

        >>> @nw.narwhalify
        ... def func(df):
        ...     new_data = {"c": [5, 2], "d": [1, 4]}
        ...     native_namespace = nw.get_native_namespace(df)
        ...     return nw.from_dict(new_data, native_namespace=native_namespace)

        Let's see what happens when passing pandas, Polars or PyArrow input:

        >>> func(pd.DataFrame(data))
           c  d
        0  5  1
        1  2  4
        >>> func(pl.DataFrame(data))
        shape: (2, 2)
        ┌─────┬─────┐
        │ c   ┆ d   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 5   ┆ 1   │
        │ 2   ┆ 4   │
        └─────┴─────┘
        >>> func(pa.table(data))
        pyarrow.Table
        c: int64
        d: int64
        ----
        c: [[5,2]]
        d: [[1,4]]
    """
    from narwhals import dtypes

    return _from_dict_impl(
        data,
        schema,
        native_namespace=native_namespace,
        dtypes=dtypes,  # type: ignore[arg-type]
    )


def _from_dict_impl(
    data: dict[str, Any],
    schema: dict[str, DType] | Schema | None = None,
    *,
    native_namespace: ModuleType | None = None,
    dtypes: DTypes,
) -> DataFrame[Any]:
    from narwhals.series import Series
    from narwhals.translate import to_native

    if not data:
        msg = "from_dict cannot be called with empty dictionary"
        raise ValueError(msg)
    if native_namespace is None:
        for val in data.values():
            if isinstance(val, Series):
                native_namespace = val.__native_namespace__()
                break
        else:
            msg = "Calling `from_dict` without `native_namespace` is only supported if all input values are already Narwhals Series"
            raise TypeError(msg)
        data = {key: to_native(value, strict=False) for key, value in data.items()}
    implementation = Implementation.from_native_namespace(native_namespace)
    if implementation is Implementation.POLARS:
        if schema:
            from narwhals._polars.utils import (
                narwhals_to_native_dtype as polars_narwhals_to_native_dtype,
            )

            schema = {
                name: polars_narwhals_to_native_dtype(dtype, dtypes=dtypes)
                for name, dtype in schema.items()
            }
        native_frame = native_namespace.from_dict(data, schema=schema)
    elif implementation in {
        Implementation.PANDAS,
        Implementation.MODIN,
        Implementation.CUDF,
    }:
        native_frame = native_namespace.DataFrame.from_dict(data)
        if schema:
            from narwhals._pandas_like.utils import (
                narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype,
            )

            backend_version = parse_version(native_namespace.__version__)
            schema = {
                name: pandas_like_narwhals_to_native_dtype(
                    schema[name], native_type, implementation, backend_version, dtypes
                )
                for name, native_type in native_frame.dtypes.items()
            }
            native_frame = native_frame.astype(schema)
    elif implementation is Implementation.PYARROW:
        if schema:
            from narwhals._arrow.utils import (
                narwhals_to_native_dtype as arrow_narwhals_to_native_dtype,
            )

            schema = native_namespace.schema(
                [
                    (name, arrow_narwhals_to_native_dtype(dtype, dtypes))
                    for name, dtype in schema.items()
                ]
            )
        native_frame = native_namespace.table(data, schema=schema)
    else:  # pragma: no cover
        try:
            # implementation is UNKNOWN, Narwhals extension using this feature should
            # implement `from_dict` function in the top-level namespace.
            native_frame = native_namespace.from_dict(data)
        except AttributeError as e:
            msg = "Unknown namespace is expected to implement `from_dict` function."
            raise AttributeError(msg) from e
    return from_native(native_frame, eager_only=True)
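

# Illustrative sketch (not part of the narwhals API): when every value in
# `data` is already a Narwhals Series, `native_namespace` can be omitted and is
# inferred from the first Series, as in the loop above. Assumes polars is
# installed; the helper name is hypothetical.
def _demo_from_dict_infers_namespace() -> None:
    import polars as pl

    a = from_native(pl.Series("a", [1, 2, 3]), series_only=True)
    b = from_native(pl.Series("b", [4, 5, 6]), series_only=True)
    # No `native_namespace` argument: it is taken from the Series values.
    df = from_dict({"a": a, "b": b})
    print(df.to_native())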


def from_arrow(
    native_frame: ArrowStreamExportable, *, native_namespace: ModuleType
) -> DataFrame[Any]:
    """
    Construct a DataFrame from an object which supports the PyCapsule Interface.

    Arguments:
        native_frame: Object which implements `__arrow_c_stream__`.

        native_namespace: The native library to use for DataFrame creation.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]}

        Let's define a dataframe-agnostic function which creates a PyArrow Table.

        >>> @nw.narwhalify
        ... def func(df):
        ...     return nw.from_arrow(df, native_namespace=pa)

        Let's see what happens when passing pandas / Polars input:

        >>> func(pd.DataFrame(data))  # doctest: +SKIP
        pyarrow.Table
        a: int64
        b: int64
        ----
        a: [[1,2,3]]
        b: [[4,5,6]]
        >>> func(pl.DataFrame(data))  # doctest: +SKIP
        pyarrow.Table
        a: int64
        b: int64
        ----
        a: [[1,2,3]]
        b: [[4,5,6]]
    """
    if not hasattr(native_frame, "__arrow_c_stream__"):
        msg = f"Given object of type {type(native_frame)} does not support PyCapsule interface"
        raise TypeError(msg)
    implementation = Implementation.from_native_namespace(native_namespace)
    if implementation is Implementation.POLARS and parse_version(
        native_namespace.__version__
    ) >= (1, 3):
        native_frame = native_namespace.DataFrame(native_frame)
    elif implementation in {
        Implementation.PANDAS,
        Implementation.MODIN,
        Implementation.CUDF,
        Implementation.POLARS,
    }:
        # These don't (yet?) support the PyCapsule Interface for import
        # so we go via PyArrow
        try:
            import pyarrow as pa  # ignore-banned-import
        except ModuleNotFoundError as exc:  # pragma: no cover
            msg = f"PyArrow>=14.0.0 is required for `from_arrow` for object of type {native_namespace}"
            raise ModuleNotFoundError(msg) from exc
        if parse_version(pa.__version__) < (14, 0):  # pragma: no cover
            msg = f"PyArrow>=14.0.0 is required for `from_arrow` for object of type {native_namespace}"
            raise ModuleNotFoundError(msg) from None

        tbl = pa.table(native_frame)
        if implementation is Implementation.PANDAS:
            native_frame = tbl.to_pandas()
        elif implementation is Implementation.MODIN:  # pragma: no cover
            from modin.pandas.utils import from_arrow

            native_frame = from_arrow(tbl)
        elif implementation is Implementation.CUDF:  # pragma: no cover
            native_frame = native_namespace.DataFrame.from_arrow(tbl)
        elif implementation is Implementation.POLARS:  # pragma: no cover
            native_frame = native_namespace.from_arrow(tbl)
        else:  # pragma: no cover
            msg = "congratulations, you entered unreachable code - please report a bug"
            raise AssertionError(msg)
    elif implementation is Implementation.PYARROW:
        native_frame = native_namespace.table(native_frame)
    else:  # pragma: no cover
        try:
            # implementation is UNKNOWN, Narwhals extension using this feature should
            # implement PyCapsule support
            native_frame = native_namespace.DataFrame(native_frame)
        except AttributeError as e:
            msg = "Unknown namespace is expected to implement `DataFrame` class which accepts object which supports PyCapsule Interface."
            raise AttributeError(msg) from e
    return from_native(native_frame, eager_only=True)
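

# Illustrative sketch (not part of the narwhals API): objects that do not
# implement `__arrow_c_stream__` are rejected up front by the guard at the top
# of `from_arrow`. Assumes pyarrow is installed; the helper name is
# hypothetical.
def _demo_from_arrow_requires_capsule() -> None:
    import pyarrow as pa

    try:
        # A plain dict does not export the Arrow C stream PyCapsule.
        from_arrow({"a": [1, 2, 3]}, native_namespace=pa)
    except TypeError as exc:
        print(f"rejected non-capsule input: {exc}")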


def _get_sys_info() -> dict[str, str]:
    """System information

    Returns system and Python version information

    Copied from sklearn
    """
    python = sys.version.replace("\n", " ")

    blob = (
        ("python", python),
        ("executable", sys.executable),
        ("machine", platform.platform()),
    )

    return dict(blob)


def _get_deps_info() -> dict[str, str]:
    """Overview of the installed version of main dependencies

    This function does not import the modules to collect the version numbers
    but instead relies on standard Python package metadata.

    Returns version information on relevant Python libraries

    This function and show_versions were copied from sklearn and adapted
    """
    deps = (
        "pandas",
        "polars",
        "cudf",
        "modin",
        "pyarrow",
        "numpy",
    )

    from . import __version__

    deps_info = {
        "narwhals": __version__,
    }

    from importlib.metadata import PackageNotFoundError
    from importlib.metadata import version

    for modname in deps:
        try:
            deps_info[modname] = version(modname)
        except PackageNotFoundError:  # noqa: PERF203
            deps_info[modname] = ""
    return deps_info


def show_versions() -> None:
    """
    Print useful debugging information

    Examples:
        >>> from narwhals import show_versions
        >>> show_versions()  # doctest: +SKIP
    """
    sys_info = _get_sys_info()
    deps_info = _get_deps_info()

    print("\nSystem:")  # noqa: T201
    for k, stat in sys_info.items():
        print(f"{k:>10}: {stat}")  # noqa: T201

    print("\nPython dependencies:")  # noqa: T201
    for k, stat in deps_info.items():
        print(f"{k:>13}: {stat}")  # noqa: T201


def get_level(
    obj: DataFrame[Any] | LazyFrame[Any] | Series,
) -> Literal["full", "interchange"]:
    """
    Level of support Narwhals has for the current object.

    This can be one of:

    - 'full': full Narwhals API support
    - 'interchange': only metadata operations are supported (`df.schema`)
    """
    return obj._level
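

# Illustrative sketch (not part of the narwhals API): a frame obtained through
# `from_native` with a fully supported backend reports the 'full' level.
# Assumes polars is installed; the helper name is hypothetical.
def _demo_get_level() -> None:
    import polars as pl

    df = from_native(pl.DataFrame({"a": [1, 2, 3]}), eager_only=True)
    # Expected: prints 'full'
    print(get_level(df))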