552 lines
20 KiB
Python
Executable File
552 lines
20 KiB
Python
Executable File
from __future__ import annotations
|
|
|
|
import re
|
|
from enum import Enum
|
|
from enum import auto
|
|
from secrets import token_hex
|
|
from typing import TYPE_CHECKING
|
|
from typing import Any
|
|
from typing import Iterable
|
|
from typing import Sequence
|
|
from typing import TypeVar
|
|
from typing import cast
|
|
from warnings import warn
|
|
|
|
from narwhals._exceptions import ColumnNotFoundError
|
|
from narwhals.dependencies import get_cudf
|
|
from narwhals.dependencies import get_dask_dataframe
|
|
from narwhals.dependencies import get_modin
|
|
from narwhals.dependencies import get_pandas
|
|
from narwhals.dependencies import get_polars
|
|
from narwhals.dependencies import get_pyarrow
|
|
from narwhals.dependencies import is_cudf_series
|
|
from narwhals.dependencies import is_modin_series
|
|
from narwhals.dependencies import is_pandas_dataframe
|
|
from narwhals.dependencies import is_pandas_like_dataframe
|
|
from narwhals.dependencies import is_pandas_like_series
|
|
from narwhals.dependencies import is_pandas_series
|
|
from narwhals.dependencies import is_polars_series
|
|
from narwhals.dependencies import is_pyarrow_chunked_array
|
|
from narwhals.translate import to_native
|
|
|
|
if TYPE_CHECKING:
|
|
from types import ModuleType
|
|
|
|
import pandas as pd
|
|
from typing_extensions import Self
|
|
from typing_extensions import TypeGuard
|
|
|
|
from narwhals.dataframe import BaseFrame
|
|
from narwhals.series import Series
|
|
|
|
T = TypeVar("T")
|
|
|
|
|
|
class Implementation(Enum):
    """Enumeration of the dataframe backends this module knows how to identify.

    Each member other than ``UNKNOWN`` is paired with the module returned by
    the matching ``get_*`` helper. ``UNKNOWN`` is the fallback returned by
    `from_native_namespace` for a module it does not recognise.
    """

    PANDAS = auto()
    MODIN = auto()
    CUDF = auto()
    PYARROW = auto()
    POLARS = auto()
    DASK = auto()

    UNKNOWN = auto()

    @classmethod
    def from_native_namespace(
        cls: type[Self], native_namespace: ModuleType
    ) -> Implementation:  # pragma: no cover
        """Instantiate Implementation object from a native namespace module.

        Arguments:
            native_namespace: Module to identify.

        Returns:
            The member paired with `native_namespace`, or ``UNKNOWN`` when the
            module is not one of the known backends.
        """
        # Built on every call: the `get_*` helpers are evaluated at call time.
        backend_by_module = {
            get_pandas(): Implementation.PANDAS,
            get_modin(): Implementation.MODIN,
            get_cudf(): Implementation.CUDF,
            get_pyarrow(): Implementation.PYARROW,
            get_polars(): Implementation.POLARS,
            get_dask_dataframe(): Implementation.DASK,
        }
        if native_namespace in backend_by_module:
            return backend_by_module[native_namespace]
        return Implementation.UNKNOWN

    def to_native_namespace(self: Self) -> ModuleType:
        """Return the native namespace module corresponding to Implementation.

        Raises:
            KeyError: If the member has no associated module (``UNKNOWN``).
        """
        backends = (
            Implementation.PANDAS,
            Implementation.MODIN,
            Implementation.CUDF,
            Implementation.PYARROW,
            Implementation.POLARS,
            Implementation.DASK,
        )
        modules = (
            get_pandas(),
            get_modin(),
            get_cudf(),
            get_pyarrow(),
            get_polars(),
            get_dask_dataframe(),
        )
        module_by_backend = dict(zip(backends, modules))
        return module_by_backend[self]  # type: ignore[no-any-return]
|
|
|
|
|
|
def remove_prefix(text: str, prefix: str) -> str:
    """Return `text` with a leading `prefix` stripped.

    If `text` does not start with `prefix`, it is returned unchanged.
    """
    if not text.startswith(prefix):
        return text  # pragma: no cover
    start = len(prefix)
    return text[start:]
|
|
|
|
|
|
def remove_suffix(text: str, suffix: str) -> str:  # pragma: no cover
    """Return `text` with a trailing `suffix` stripped.

    If `text` does not end with `suffix`, or `suffix` is empty, `text` is
    returned unchanged.
    """
    # Every string "ends with" the empty string, and `text[:-0]` is the same
    # as `text[:0]` (i.e. ""), so an empty suffix must be special-cased or the
    # whole string would be dropped.
    if suffix and text.endswith(suffix):
        return text[: -len(suffix)]
    return text
|
|
|
|
|
|
def flatten(args: Any) -> list[Any]:
    """Unwrap a single iterable argument; otherwise return `args` as given.

    - Falsy `args` (e.g. an empty tuple) yields a new empty list.
    - If `args` holds exactly one element and `_is_iterable` accepts it,
      that element is returned.
    - In every other case `args` itself is returned.
    """
    if not args:
        return []
    # `and` short-circuits: `_is_iterable` is only consulted for 1-element input.
    holds_single_iterable = len(args) == 1 and _is_iterable(args[0])
    return args[0] if holds_single_iterable else args  # type: ignore[no-any-return]
|
|
|
|
|
|
def tupleify(arg: Any) -> Any:
    """Wrap `arg` in a 1-tuple unless it is already a list or a tuple."""
    if isinstance(arg, (list, tuple)):
        return arg
    return (arg,)  # pragma: no cover
|
|
|
|
|
|
def _is_iterable(arg: Any | Iterable[Any]) -> bool:
    """Tell whether `arg` should be treated as a collection of arguments.

    Strings, bytes and Narwhals `Series` are iterable but are deliberately
    reported as non-iterable here.

    Raises:
        TypeError: If `arg` is a native pandas DataFrame/Series or a native
            Polars Series/Expr/DataFrame/LazyFrame, i.e. the caller passed a
            native object where a Narwhals one was expected.
    """
    from narwhals.series import Series

    if is_pandas_dataframe(arg) or is_pandas_series(arg):
        msg = f"Expected Narwhals class or scalar, got: {type(arg)}. Perhaps you forgot a `nw.from_native` somewhere?"
        raise TypeError(msg)

    # `get_polars()` may return None, in which case the Polars check is skipped.
    pl = get_polars()
    if pl is not None:
        native_polars_types = (pl.Series, pl.Expr, pl.DataFrame, pl.LazyFrame)
        if isinstance(arg, native_polars_types):
            msg = (
                f"Expected Narwhals class or scalar, got: {type(arg)}.\n\n"
                "Hint: Perhaps you\n"
                "- forgot a `nw.from_native` somewhere?\n"
                "- used `pl.col` instead of `nw.col`?"
            )
            raise TypeError(msg)

    if isinstance(arg, (str, bytes, Series)):
        return False
    return isinstance(arg, Iterable)
|
|
|
|
|
|
def parse_version(version: Sequence[str | int]) -> tuple[int, ...]:
    """Simple version parser; split into a tuple of ints for comparison.

    A string is split on ``"."``; any other input is iterated as given. Every
    non-digit character is removed from each component before converting it
    to `int`, so ``"3rc1"`` becomes ``31``.
    """
    # lifted from Polars
    components = (
        version.split(".") if isinstance(version, str) else version  # pragma: no cover
    )
    digits_only = (re.sub(r"\D", "", str(component)) for component in components)
    return tuple(int(digits) for digits in digits_only)
|
|
|
|
|
|
def isinstance_or_issubclass(obj: Any, cls: Any) -> bool:
    """Check whether `obj` is an instance of `cls` or a subclass of `cls`.

    Arguments:
        obj: An instance or a class.
        cls: A class (or tuple of classes) to test against.

    Returns:
        For `DType` instances, the result of `isinstance(obj, cls)` only.
        Otherwise True if `obj` is an instance of `cls`, or `obj` is itself a
        class that is a subclass of `cls`.
    """
    from narwhals.dtypes import DType

    if isinstance(obj, DType):
        return isinstance(obj, cls)
    # `issubclass` raises TypeError when its first argument is not a class, so
    # only attempt it for actual classes; any other non-instance is a plain False.
    return isinstance(obj, cls) or (isinstance(obj, type) and issubclass(obj, cls))
|
|
|
|
|
|
def validate_laziness(items: Iterable[Any]) -> None:
    """Ensure `items` are either all eager `DataFrame`s or all `LazyFrame`s.

    Arguments:
        items: The objects to check. Any iterable is accepted, including
            one-shot iterators and generators.

    Raises:
        NotImplementedError: If `items` mixes eager and lazy frames, or
            contains anything that is neither.
    """
    from narwhals.dataframe import DataFrame
    from narwhals.dataframe import LazyFrame

    # Materialise once: `items` is scanned up to twice below, and a one-shot
    # iterator would be exhausted by the first pass, making the second `all`
    # vacuously true and letting mixed input through.
    materialised = tuple(items)

    if all(isinstance(item, DataFrame) for item in materialised) or (
        all(isinstance(item, LazyFrame) for item in materialised)
    ):
        return
    msg = "The items to concatenate should either all be eager, or all lazy"
    raise NotImplementedError(msg)
|
|
|
|
|
|
def maybe_align_index(lhs: T, rhs: Series | BaseFrame[Any]) -> T:
    """
    Align `lhs` to the Index of `rhs`, if they're both pandas-like.

    Arguments:
        lhs: Narwhals DataFrame or Series to realign.
        rhs: Narwhals DataFrame or Series whose Index is used as the target.

    Raises:
        ValueError: If both inputs are pandas-like and either Index contains
            duplicates, or if the inputs are not both pandas-like and their
            lengths differ.

    Notes:
        This is only really intended for backwards-compatibility purposes,
        for example if your library already aligns indices for users.
        If you're designing a new library, we highly encourage you to not
        rely on the Index.
        For non-pandas-like inputs, this only checks that `lhs` and `rhs`
        are the same length.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2]}, index=[3, 4])
        >>> s_pd = pd.Series([6, 7], index=[4, 3])
        >>> df = nw.from_native(df_pd)
        >>> s = nw.from_native(s_pd, series_only=True)
        >>> nw.to_native(nw.maybe_align_index(df, s))
           a
        4  2
        3  1
    """
    from narwhals._pandas_like.dataframe import PandasLikeDataFrame
    from narwhals._pandas_like.series import PandasLikeSeries

    def _require_unique(index: Any) -> None:
        if index.is_unique:
            return
        msg = "given index doesn't have a unique index"
        raise ValueError(msg)

    left = cast(Any, lhs)
    right = cast(Any, rhs)

    # Native pandas-like object behind `rhs` (a frame takes precedence over a
    # series), or None when `rhs` is not pandas-like.
    right_native = None
    right_frame = getattr(right, "_compliant_frame", None)
    if isinstance(right_frame, PandasLikeDataFrame):
        right_native = right_frame._native_frame
    else:
        right_series = getattr(right, "_compliant_series", None)
        if isinstance(right_series, PandasLikeSeries):
            right_native = right_series._native_series

    if right_native is not None:
        left_frame = getattr(left, "_compliant_frame", None)
        if isinstance(left_frame, PandasLikeDataFrame):
            left_native = left_frame._native_frame
            # Validate `lhs` first, then `rhs`, before reindexing with `.loc`.
            _require_unique(left_native.index)
            _require_unique(right_native.index)
            realigned = left_native.loc[right_native.index]
            return left._from_compliant_dataframe(  # type: ignore[no-any-return]
                left_frame._from_native_frame(realigned)
            )

        left_series = getattr(left, "_compliant_series", None)
        if isinstance(left_series, PandasLikeSeries):
            left_native = left_series._native_series
            _require_unique(left_native.index)
            _require_unique(right_native.index)
            realigned = left_native.loc[right_native.index]
            return left._from_compliant_series(  # type: ignore[no-any-return]
                left_series._from_native_series(realigned)
            )

    # At least one side is not pandas-like: only a length check is possible.
    n_left = len(left)
    n_right = len(right)
    if n_left != n_right:
        msg = f"Expected `lhs` and `rhs` to have the same length, got {n_left} and {n_right}"
        raise ValueError(msg)
    return lhs
|
|
|
|
|
|
def maybe_get_index(obj: T) -> Any | None:
    """
    Get the index of a DataFrame or a Series, if it's pandas-like.

    Arguments:
        obj: Narwhals DataFrame or Series.

    Returns:
        The `.index` of the underlying native object when it is pandas-like,
        otherwise `None`.

    Notes:
        This is only really intended for backwards-compatibility purposes,
        for example if your library already aligns indices for users.
        If you're designing a new library, we highly encourage you to not
        rely on the Index.
        For non-pandas-like inputs, this returns `None`.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 5]})
        >>> df = nw.from_native(df_pd)
        >>> nw.maybe_get_index(df)
        RangeIndex(start=0, stop=2, step=1)
        >>> series_pd = pd.Series([1, 2])
        >>> series = nw.from_native(series_pd, series_only=True)
        >>> nw.maybe_get_index(series)
        RangeIndex(start=0, stop=2, step=1)
    """
    native = to_native(cast(Any, obj))
    has_index = is_pandas_like_dataframe(native) or is_pandas_like_series(native)
    return native.index if has_index else None
|
|
|
|
|
|
def maybe_set_index(df: T, column_names: str | list[str]) -> T:
    """
    Set the columns `column_names` to be the index of `df`, if `df` is pandas-like.

    Arguments:
        df: Narwhals DataFrame.
        column_names: Name, or list of names, of the columns to use as the index.

    Notes:
        This is only really intended for backwards-compatibility purposes,
        for example if your library already aligns indices for users.
        If you're designing a new library, we highly encourage you to not
        rely on the Index.
        For non-pandas-like inputs, this is a no-op.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 5]})
        >>> df = nw.from_native(df_pd)
        >>> nw.to_native(nw.maybe_set_index(df, "b"))  # doctest: +NORMALIZE_WHITESPACE
           a
        b
        4  1
        5  2
    """
    frame = cast(Any, df)
    native = to_native(frame)
    if not is_pandas_like_dataframe(native):
        return frame  # type: ignore[no-any-return]

    reindexed = native.set_index(column_names)
    compliant = frame._compliant_frame._from_native_frame(reindexed)
    return frame._from_compliant_dataframe(compliant)  # type: ignore[no-any-return]
|
|
|
|
|
|
def maybe_reset_index(obj: T) -> T:
    """
    Reset the index to the default integer index of a DataFrame or a Series, if it's pandas-like.

    Arguments:
        obj: Narwhals DataFrame or Series.

    Returns:
        `obj` itself when it is not pandas-like or already has the default
        index; otherwise a new object wrapping the native data after
        `reset_index(drop=True)`.

    Notes:
        This is only really intended for backwards-compatibility purposes,
        for example if your library already resets the index for users.
        If you're designing a new library, we highly encourage you to not
        rely on the Index.
        For non-pandas-like inputs, this is a no-op.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 5]}, index=([6, 7]))
        >>> df = nw.from_native(df_pd)
        >>> nw.to_native(nw.maybe_reset_index(df))
           a  b
        0  1  4
        1  2  5
        >>> series_pd = pd.Series([1, 2])
        >>> series = nw.from_native(series_pd, series_only=True)
        >>> nw.maybe_get_index(nw.maybe_reset_index(series))
        RangeIndex(start=0, stop=2, step=1)
    """
    wrapped = cast(Any, obj)
    native = to_native(wrapped)

    is_frame = is_pandas_like_dataframe(native)
    if not is_frame and not is_pandas_like_series(native):
        return wrapped  # type: ignore[no-any-return]

    # Nothing to do when the index is already the default one.
    if _has_default_index(native, wrapped.__native_namespace__()):
        return wrapped  # type: ignore[no-any-return]

    if is_frame:
        return wrapped._from_compliant_dataframe(  # type: ignore[no-any-return]
            wrapped._compliant_frame._from_native_frame(native.reset_index(drop=True))
        )
    return wrapped._from_compliant_series(  # type: ignore[no-any-return]
        wrapped._compliant_series._from_native_series(native.reset_index(drop=True))
    )
|
|
|
|
|
|
def _has_default_index(
|
|
native_frame_or_series: pd.Series | pd.DataFrame, native_namespace: Any
|
|
) -> bool:
|
|
index = native_frame_or_series.index
|
|
return (
|
|
isinstance(index, native_namespace.RangeIndex)
|
|
and index.start == 0
|
|
and index.stop == len(index)
|
|
and index.step == 1
|
|
)
|
|
|
|
|
|
def maybe_convert_dtypes(obj: T, *args: bool, **kwargs: bool | str) -> T:
    """
    Convert columns or series to the best possible dtypes using dtypes supporting ``pd.NA``, if df is pandas-like.

    Arguments:
        obj: DataFrame or Series.
        *args: Positional arguments forwarded to the native `convert_dtypes`.
        **kwargs: Keyword arguments forwarded to the native `convert_dtypes`.

    Notes:
        For non-pandas-like inputs, this is a no-op.
        Also, `args` and `kwargs` just get passed down to the underlying library as-is.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> import numpy as np
        >>> df_pd = pd.DataFrame(
        ...     {
        ...         "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
        ...         "b": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
        ...     }
        ... )
        >>> df = nw.from_native(df_pd)
        >>> nw.to_native(nw.maybe_convert_dtypes(df)).dtypes  # doctest: +NORMALIZE_WHITESPACE
        a             Int32
        b           boolean
        dtype: object
    """
    wrapped = cast(Any, obj)
    native = to_native(wrapped)

    if is_pandas_like_dataframe(native):
        converted = native.convert_dtypes(*args, **kwargs)
        compliant_frame = wrapped._compliant_frame._from_native_frame(converted)
        return wrapped._from_compliant_dataframe(compliant_frame)  # type: ignore[no-any-return]

    if is_pandas_like_series(native):
        converted = native.convert_dtypes(*args, **kwargs)
        compliant_series = wrapped._compliant_series._from_native_series(converted)
        return wrapped._from_compliant_series(compliant_series)  # type: ignore[no-any-return]

    return wrapped  # type: ignore[no-any-return]
|
|
|
|
|
|
def is_ordered_categorical(series: Series) -> bool:
    """
    Return whether indices of categories are semantically meaningful.

    This is a convenience function to accessing what would otherwise be
    the `is_ordered` property from the DataFrame Interchange Protocol,
    see https://data-apis.org/dataframe-protocol/latest/API.html.

    - For interchange-level series:
      - Categoricals report `describe_categorical["is_ordered"]`.
    - For Polars:
      - Enums are always ordered.
      - Categoricals are ordered if `dtype.ordering == "physical"`.
    - For pandas-like APIs (pandas, Modin, cuDF):
      - Categoricals are ordered if `series.cat.ordered == True`.
    - For PyArrow chunked arrays:
      - Categoricals are ordered if `type.ordered == True`.

    Anything else is reported as not ordered.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = ["x", "y"]
        >>> s_pd = pd.Series(data, dtype=pd.CategoricalDtype(ordered=True))
        >>> s_pl = pl.Series(data, dtype=pl.Categorical(ordering="physical"))

        Let's define a library-agnostic function:

        >>> @nw.narwhalify
        ... def func(s):
        ...     return nw.is_ordered_categorical(s)

        Then, we can pass any supported library to `func`:

        >>> func(s_pd)
        True
        >>> func(s_pl)
        True
    """
    from narwhals._interchange.series import InterchangeSeries

    compliant = series._compliant_series
    dtypes = compliant._dtypes

    # Interchange-level categoricals are answered before any other check.
    if isinstance(compliant, InterchangeSeries) and series.dtype == dtypes.Categorical:
        categorical_info = compliant._native_series.describe_categorical
        return categorical_info["is_ordered"]  # type: ignore[no-any-return]

    if series.dtype == dtypes.Enum:
        return True
    if series.dtype != dtypes.Categorical:
        return False

    native = to_native(series)
    if is_polars_series(native):
        return native.dtype.ordering == "physical"  # type: ignore[attr-defined, no-any-return]
    # pandas, Modin and cuDF all expose the flag through the `.cat` accessor.
    if is_pandas_series(native) or is_modin_series(native) or is_cudf_series(native):
        return native.cat.ordered  # type: ignore[no-any-return]
    if is_pyarrow_chunked_array(native):
        return native.type.ordered  # type: ignore[no-any-return]
    # If it doesn't match any of the above, let's just play it safe and return False.
    return False  # pragma: no cover
|
|
|
|
|
|
def generate_unique_token(n_bytes: int, columns: list[str]) -> str:  # pragma: no cover
    """Deprecated alias of `generate_temporary_column_name`.

    Emits a `DeprecationWarning` attributed to the caller, then forwards both
    arguments to `generate_temporary_column_name` and returns its result.
    """
    deprecation_message = (
        "Use `generate_temporary_column_name` instead. `generate_unique_token` is "
        "deprecated and it will be removed in future versions"
    )
    # stacklevel=2 makes the warning point at the caller, not at this wrapper.
    warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return generate_temporary_column_name(n_bytes=n_bytes, columns=columns)
|
|
|
|
|
|
def generate_temporary_column_name(n_bytes: int, columns: list[str]) -> str:
    """Generate a random hex token that is not present in the given list of columns.

    It relies on [python secrets token_hex](https://docs.python.org/3/library/secrets.html#secrets.token_hex)
    to produce a hex string built from `n_bytes` random bytes.

    Arguments:
        n_bytes: The number of bytes to generate for the token.
        columns: The list of columns to check for uniqueness.

    Returns:
        A unique token that is not present in the given list of columns.

    Raises:
        AssertionError: If every one of the 101 generated candidates is
            already present in `columns`.

    Examples:
        >>> import narwhals as nw
        >>> columns = ["abc", "xyz"]
        >>> nw.generate_temporary_column_name(n_bytes=8, columns=columns) not in columns
        True
    """
    max_attempts = 101
    for _ in range(max_attempts):
        candidate = token_hex(n_bytes)
        if candidate not in columns:
            return candidate

    msg = (
        "Internal Error: Narwhals was not able to generate a column name with "
        f"{n_bytes=} and not in {columns}"
    )
    raise AssertionError(msg)
|
|
|
|
|
|
def parse_columns_to_drop(
    compliant_frame: Any,
    columns: Iterable[str],
    strict: bool,  # noqa: FBT001
) -> list[str]:
    """Resolve which of the requested `columns` should be dropped from a frame.

    Arguments:
        compliant_frame: Object exposing a `columns` attribute with the
            available column names.
        columns: Names the caller asked to drop.
        strict: If True, every requested name must exist in the frame. If
            False, names missing from the frame are silently ignored.

    Returns:
        In strict mode, the requested names exactly as given. Otherwise the
        requested names that exist in the frame, de-duplicated and in the
        order they were first requested.

    Raises:
        ColumnNotFoundError: In strict mode, for the first requested name
            that is not a column of the frame.
    """
    available = set(compliant_frame.columns)
    to_drop = list(columns)

    if strict:
        for name in to_drop:
            if name not in available:
                msg = f'"{name}" not found'
                raise ColumnNotFoundError(msg)
        return to_drop

    # Filter in request order rather than via a set intersection: iterating a
    # set of strings depends on hash randomisation, which made the result
    # order differ between runs. `dict.fromkeys` drops duplicates while
    # keeping first-seen order.
    return [name for name in dict.fromkeys(to_drop) if name in available]
|
|
|
|
|
|
def is_sequence_but_not_str(sequence: Any) -> TypeGuard[Sequence[Any]]:
    """Return True for any `Sequence` instance except `str`."""
    if isinstance(sequence, str):
        return False
    return isinstance(sequence, Sequence)
|