Files
Buffteks-Website/streamlit-venv/lib/python3.10/site-packages/narwhals/utils.py
2025-01-10 21:40:35 +00:00

552 lines
20 KiB
Python
Executable File

from __future__ import annotations
import re
from enum import Enum
from enum import auto
from secrets import token_hex
from typing import TYPE_CHECKING
from typing import Any
from typing import Iterable
from typing import Sequence
from typing import TypeVar
from typing import cast
from warnings import warn
from narwhals._exceptions import ColumnNotFoundError
from narwhals.dependencies import get_cudf
from narwhals.dependencies import get_dask_dataframe
from narwhals.dependencies import get_modin
from narwhals.dependencies import get_pandas
from narwhals.dependencies import get_polars
from narwhals.dependencies import get_pyarrow
from narwhals.dependencies import is_cudf_series
from narwhals.dependencies import is_modin_series
from narwhals.dependencies import is_pandas_dataframe
from narwhals.dependencies import is_pandas_like_dataframe
from narwhals.dependencies import is_pandas_like_series
from narwhals.dependencies import is_pandas_series
from narwhals.dependencies import is_polars_series
from narwhals.dependencies import is_pyarrow_chunked_array
from narwhals.translate import to_native
if TYPE_CHECKING:
from types import ModuleType
import pandas as pd
from typing_extensions import Self
from typing_extensions import TypeGuard
from narwhals.dataframe import BaseFrame
from narwhals.series import Series
# Generic type variable: the `maybe_*` helpers in this module are annotated
# as returning the same type `T` that they accept.
T = TypeVar("T")
class Implementation(Enum):
    """Enumeration of the dataframe backends this module can tell apart.

    `UNKNOWN` is the fallback used when a namespace module does not match any
    of the named backends.
    """

    # Member order matters: `auto()` numbers the members sequentially.
    PANDAS = auto()
    MODIN = auto()
    CUDF = auto()
    PYARROW = auto()
    POLARS = auto()
    DASK = auto()
    UNKNOWN = auto()

    @classmethod
    def from_native_namespace(
        cls: type[Self], native_namespace: ModuleType
    ) -> Implementation:  # pragma: no cover
        """Look up the Implementation member matching a native namespace module.

        Arguments:
            native_namespace: Module object to identify.

        Returns:
            The matching member, or `Implementation.UNKNOWN` when the module
            is not one of the known backends.
        """
        namespace_pairs = (
            (get_pandas(), Implementation.PANDAS),
            (get_modin(), Implementation.MODIN),
            (get_cudf(), Implementation.CUDF),
            (get_pyarrow(), Implementation.PYARROW),
            (get_polars(), Implementation.POLARS),
            (get_dask_dataframe(), Implementation.DASK),
        )
        # `dict(...)` keeps the last pair on duplicate keys, like a dict literal.
        by_namespace = dict(namespace_pairs)
        return by_namespace.get(native_namespace, Implementation.UNKNOWN)

    def to_native_namespace(self: Self) -> ModuleType:
        """Return the native namespace module for this Implementation member.

        Raises:
            KeyError: If called on a member without a namespace getter
                (i.e. `Implementation.UNKNOWN`).
        """
        namespace_getters = (
            (Implementation.PANDAS, get_pandas),
            (Implementation.MODIN, get_modin),
            (Implementation.CUDF, get_cudf),
            (Implementation.PYARROW, get_pyarrow),
            (Implementation.POLARS, get_polars),
            (Implementation.DASK, get_dask_dataframe),
        )
        # Every getter is invoked (in declaration order) before the lookup.
        by_member = {member: getter() for member, getter in namespace_getters}
        return by_member[self]  # type: ignore[no-any-return]
def remove_prefix(text: str, prefix: str) -> str:
    """Return `text` without a leading `prefix`; unchanged if it is absent."""
    has_prefix = text.startswith(prefix)
    return text[len(prefix) :] if has_prefix else text
def remove_suffix(text: str, suffix: str) -> str:  # pragma: no cover
    """Return `text` without a trailing `suffix`; unchanged if it is absent.

    Arguments:
        text: String to trim.
        suffix: Ending to strip. An empty suffix leaves `text` untouched.

    Returns:
        `text` with one occurrence of `suffix` removed from its end, or
        `text` itself when it does not end with `suffix`.
    """
    # Guard against the empty suffix: every string "ends with" "", and
    # `text[:-0]` is `text[:0]`, which would wrongly return "".
    if suffix and text.endswith(suffix):
        return text[: -len(suffix)]
    return text
def flatten(args: Any) -> list[Any]:
    """Unwrap a lone iterable argument, otherwise hand `args` back as-is.

    Arguments:
        args: Collection of positional arguments.

    Returns:
        - `[]` when `args` is falsy,
        - `args[0]` when `args` has exactly one element and `_is_iterable`
          accepts it,
        - `args` itself (not copied) in every other case.
    """
    if not args:
        return []
    only_one_iterable = len(args) == 1 and _is_iterable(args[0])
    return args[0] if only_one_iterable else args  # type: ignore[no-any-return]
def tupleify(arg: Any) -> Any:
    """Return lists and tuples untouched; wrap anything else in a 1-tuple."""
    if isinstance(arg, (list, tuple)):
        return arg
    return (arg,)  # pragma: no cover
def _is_iterable(arg: Any | Iterable[Any]) -> bool:
    """Tell whether `arg` is an iterable that should be unpacked.

    Strings, bytes and Narwhals `Series` are iterable but are deliberately
    reported as not iterable here.

    Raises:
        TypeError: If `arg` is a native pandas DataFrame/Series or a native
            Polars Series/Expr/DataFrame/LazyFrame rather than a Narwhals
            object.
    """
    from narwhals.series import Series

    if is_pandas_dataframe(arg) or is_pandas_series(arg):
        msg = f"Expected Narwhals class or scalar, got: {type(arg)}. Perhaps you forgot a `nw.from_native` somewhere?"
        raise TypeError(msg)

    pl = get_polars()
    if pl is not None:
        native_polars_types = (pl.Series, pl.Expr, pl.DataFrame, pl.LazyFrame)
        if isinstance(arg, native_polars_types):
            msg = (
                f"Expected Narwhals class or scalar, got: {type(arg)}.\n\n"
                "Hint: Perhaps you\n"
                "- forgot a `nw.from_native` somewhere?\n"
                "- used `pl.col` instead of `nw.col`?"
            )
            raise TypeError(msg)

    if not isinstance(arg, Iterable):
        return False
    return not isinstance(arg, (str, bytes, Series))
def parse_version(version: Sequence[str | int]) -> tuple[int, ...]:
    """Turn a version into a tuple of ints suitable for comparison.

    Arguments:
        version: Either a dotted version string (split on ".") or a sequence
            of components. Every non-digit character is stripped from each
            component before it is converted to `int`.

    Returns:
        One integer per component.
    """
    # lifted from Polars
    components = version.split(".") if isinstance(version, str) else version  # pragma: no cover
    digits_only = (re.sub(r"\D", "", str(component)) for component in components)
    return tuple(map(int, digits_only))
def isinstance_or_issubclass(obj: Any, cls: Any) -> bool:
    """Check `obj` against `cls` as an instance or, for non-DType objects, as a subclass.

    `DType` instances are only ever tested with `isinstance`; `issubclass`
    is attempted solely for objects that are not `DType` instances.
    """
    from narwhals.dtypes import DType

    if isinstance(obj, cls):
        return True
    if isinstance(obj, DType):
        # A DType instance that failed the isinstance check is never a match.
        return False
    return issubclass(obj, cls)
def validate_laziness(items: Iterable[Any]) -> None:
    """Ensure every item is eager (`DataFrame`) or every item is lazy (`LazyFrame`).

    Arguments:
        items: Objects about to be concatenated. Any iterable is accepted,
            including one-shot iterators.

    Raises:
        NotImplementedError: If the items are neither all `DataFrame` nor
            all `LazyFrame`.
    """
    from narwhals.dataframe import DataFrame
    from narwhals.dataframe import LazyFrame

    # Materialise once: the items are scanned twice below, and a one-shot
    # iterator would already be exhausted for the second scan, making that
    # `all(...)` vacuously true and letting mixed input through.
    materialised = list(items)
    if all(isinstance(item, DataFrame) for item in materialised) or (
        all(isinstance(item, LazyFrame) for item in materialised)
    ):
        return
    msg = "The items to concatenate should either all be eager, or all lazy"
    raise NotImplementedError(msg)
def maybe_align_index(lhs: T, rhs: Series | BaseFrame[Any]) -> T:
    """
    Reorder `lhs` so it follows the Index of `rhs`, when both are pandas-like.

    Arguments:
        lhs: Narwhals DataFrame or Series to realign.
        rhs: Narwhals DataFrame or Series whose index gives the target order.

    Returns:
        A new object of the same kind as `lhs`, selected with `.loc` on
        `rhs`'s index, when both inputs are pandas-like; otherwise `lhs`
        itself.

    Raises:
        ValueError: If either pandas-like index is not unique, or (for inputs
            that are not both pandas-like) if the lengths differ.

    Notes:
        This is only really intended for backwards-compatibility purposes,
        for example if your library already aligns indices for users.
        If you're designing a new library, we highly encourage you to not
        rely on the Index.
        For non-pandas-like inputs, this only checks that `lhs` and `rhs`
        are the same length.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2]}, index=[3, 4])
        >>> s_pd = pd.Series([6, 7], index=[4, 3])
        >>> df = nw.from_native(df_pd)
        >>> s = nw.from_native(s_pd, series_only=True)
        >>> nw.to_native(nw.maybe_align_index(df, s))
           a
        4  2
        3  1
    """
    from narwhals._pandas_like.dataframe import PandasLikeDataFrame
    from narwhals._pandas_like.series import PandasLikeSeries

    def _require_unique(index: Any) -> None:
        if index.is_unique:
            return
        msg = "given index doesn't have a unique index"
        raise ValueError(msg)

    lhs_any = cast(Any, lhs)
    rhs_any = cast(Any, rhs)

    # (attribute holding the compliant object, its expected class,
    #  attribute on the compliant object holding the native object)
    frame_kind = ("_compliant_frame", PandasLikeDataFrame, "_native_frame")
    series_kind = ("_compliant_series", PandasLikeSeries, "_native_series")

    # Combinations are tried in the order frame/frame, frame/series,
    # series/frame, series/series.
    for lhs_attr, lhs_cls, lhs_native_attr in (frame_kind, series_kind):
        lhs_compliant = getattr(lhs_any, lhs_attr, None)
        if not isinstance(lhs_compliant, lhs_cls):
            continue
        for rhs_attr, rhs_cls, rhs_native_attr in (frame_kind, series_kind):
            rhs_compliant = getattr(rhs_any, rhs_attr, None)
            if not isinstance(rhs_compliant, rhs_cls):
                continue

            lhs_native = getattr(lhs_compliant, lhs_native_attr)
            _require_unique(lhs_native.index)
            rhs_index = getattr(rhs_compliant, rhs_native_attr).index
            _require_unique(rhs_index)

            realigned = lhs_native.loc[rhs_index]
            if lhs_cls is PandasLikeDataFrame:
                return lhs_any._from_compliant_dataframe(  # type: ignore[no-any-return]
                    lhs_compliant._from_native_frame(realigned)
                )
            return lhs_any._from_compliant_series(  # type: ignore[no-any-return]
                lhs_compliant._from_native_series(realigned)
            )

    # Not both pandas-like: only the lengths can be compared.
    lhs_len = len(lhs_any)
    rhs_len = len(rhs_any)
    if lhs_len != rhs_len:
        msg = f"Expected `lhs` and `rhs` to have the same length, got {lhs_len} and {rhs_len}"
        raise ValueError(msg)
    return lhs
def maybe_get_index(obj: T) -> Any | None:
    """
    Return the index of a pandas-like DataFrame or Series.

    Arguments:
        obj: Narwhals DataFrame or Series.

    Returns:
        The `.index` of the underlying native object when it is pandas-like,
        otherwise `None`.

    Notes:
        This is only really intended for backwards-compatibility purposes,
        for example if your library already aligns indices for users.
        If you're designing a new library, we highly encourage you to not
        rely on the Index.
        For non-pandas-like inputs, this returns `None`.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 5]})
        >>> df = nw.from_native(df_pd)
        >>> nw.maybe_get_index(df)
        RangeIndex(start=0, stop=2, step=1)
        >>> series_pd = pd.Series([1, 2])
        >>> series = nw.from_native(series_pd, series_only=True)
        >>> nw.maybe_get_index(series)
        RangeIndex(start=0, stop=2, step=1)
    """
    native = to_native(cast(Any, obj))
    pandas_like = is_pandas_like_dataframe(native) or is_pandas_like_series(native)
    return native.index if pandas_like else None
def maybe_set_index(df: T, column_names: str | list[str]) -> T:
    """
    Make `column_names` the index of `df`, if `df` is pandas-like.

    Arguments:
        df: Narwhals DataFrame.
        column_names: Column name, or list of column names, passed to the
            native `set_index`.

    Returns:
        A new Narwhals DataFrame with the index set when `df` is pandas-like,
        otherwise `df` unchanged.

    Notes:
        This is only really intended for backwards-compatibility purposes,
        for example if your library already aligns indices for users.
        If you're designing a new library, we highly encourage you to not
        rely on the Index.
        For non-pandas-like inputs, this is a no-op.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 5]})
        >>> df = nw.from_native(df_pd)
        >>> nw.to_native(nw.maybe_set_index(df, "b"))  # doctest: +NORMALIZE_WHITESPACE
           a
        b
        4  1
        5  2
    """
    wrapped = cast(Any, df)
    native = to_native(wrapped)
    if not is_pandas_like_dataframe(native):
        return wrapped  # type: ignore[no-any-return]
    indexed = native.set_index(column_names)
    compliant = wrapped._compliant_frame._from_native_frame(indexed)
    return wrapped._from_compliant_dataframe(compliant)  # type: ignore[no-any-return]
def maybe_reset_index(obj: T) -> T:
    """
    Replace the index of a pandas-like DataFrame or Series with the default integer index.

    Arguments:
        obj: Narwhals DataFrame or Series.

    Returns:
        `obj` itself when it is not pandas-like or already has the default
        index; otherwise a new object built from `reset_index(drop=True)`.

    Notes:
        This is only really intended for backwards-compatibility purposes,
        for example if your library already resets the index for users.
        If you're designing a new library, we highly encourage you to not
        rely on the Index.
        For non-pandas-like inputs, this is a no-op.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 5]}, index=([6, 7]))
        >>> df = nw.from_native(df_pd)
        >>> nw.to_native(nw.maybe_reset_index(df))
           a  b
        0  1  4
        1  2  5
        >>> series_pd = pd.Series([1, 2])
        >>> series = nw.from_native(series_pd, series_only=True)
        >>> nw.maybe_get_index(series)
        RangeIndex(start=0, stop=2, step=1)
    """
    wrapped = cast(Any, obj)
    native = to_native(wrapped)

    is_frame = is_pandas_like_dataframe(native)
    if not is_frame and not is_pandas_like_series(native):
        return wrapped  # type: ignore[no-any-return]

    # Nothing to do when the index is already the default one.
    if _has_default_index(native, wrapped.__native_namespace__()):
        return wrapped  # type: ignore[no-any-return]

    if is_frame:
        return wrapped._from_compliant_dataframe(  # type: ignore[no-any-return]
            wrapped._compliant_frame._from_native_frame(native.reset_index(drop=True))
        )
    return wrapped._from_compliant_series(  # type: ignore[no-any-return]
        wrapped._compliant_series._from_native_series(native.reset_index(drop=True))
    )
def _has_default_index(
native_frame_or_series: pd.Series | pd.DataFrame, native_namespace: Any
) -> bool:
index = native_frame_or_series.index
return (
isinstance(index, native_namespace.RangeIndex)
and index.start == 0
and index.stop == len(index)
and index.step == 1
)
def maybe_convert_dtypes(obj: T, *args: bool, **kwargs: bool | str) -> T:
    """
    Call the native `convert_dtypes` on a pandas-like DataFrame or Series.

    Arguments:
        obj: DataFrame or Series.
        *args: Additional arguments which gets passed through.
        **kwargs: Additional arguments which gets passed through.

    Returns:
        A new Narwhals object wrapping the converted native object when `obj`
        is pandas-like, otherwise `obj` unchanged.

    Notes:
        For non-pandas-like inputs, this is a no-op.
        Also, `args` and `kwargs` just get passed down to the underlying library as-is.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> import numpy as np
        >>> df_pd = pd.DataFrame(
        ...     {
        ...         "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
        ...         "b": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
        ...     }
        ... )
        >>> df = nw.from_native(df_pd)
        >>> nw.to_native(nw.maybe_convert_dtypes(df)).dtypes  # doctest: +NORMALIZE_WHITESPACE
        a      Int32
        b    boolean
        dtype: object
    """
    wrapped = cast(Any, obj)
    native = to_native(wrapped)

    is_frame = is_pandas_like_dataframe(native)
    if not is_frame and not is_pandas_like_series(native):
        return wrapped  # type: ignore[no-any-return]

    if is_frame:
        return wrapped._from_compliant_dataframe(  # type: ignore[no-any-return]
            wrapped._compliant_frame._from_native_frame(
                native.convert_dtypes(*args, **kwargs)
            )
        )
    return wrapped._from_compliant_series(  # type: ignore[no-any-return]
        wrapped._compliant_series._from_native_series(
            native.convert_dtypes(*args, **kwargs)
        )
    )
def is_ordered_categorical(series: Series) -> bool:
    """
    Report whether the categories of `series` carry a meaningful order.

    This is a convenience function to accessing what would otherwise be
    the `is_ordered` property from the DataFrame Interchange Protocol,
    see https://data-apis.org/dataframe-protocol/latest/API.html.

    - Enum dtypes are always reported as ordered.
    - Non-categorical dtypes are never ordered.
    - Interchange-level series: the `"is_ordered"` entry of
      `describe_categorical` is returned.
    - Polars: ordered if `dtype.ordering == "physical"`.
    - pandas, Modin, cuDF: the value of `.cat.ordered`.
    - PyArrow chunked arrays: the value of `.type.ordered`.
    - Anything else: `False`.

    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = ["x", "y"]
        >>> s_pd = pd.Series(data, dtype=pd.CategoricalDtype(ordered=True))
        >>> s_pl = pl.Series(data, dtype=pl.Categorical(ordering="physical"))

        Let's define a library-agnostic function:

        >>> @nw.narwhalify
        ... def func(s):
        ...     return nw.is_ordered_categorical(s)

        Then, we can pass any supported library to `func`:

        >>> func(s_pd)
        True
        >>> func(s_pl)
        True
    """
    from narwhals._interchange.series import InterchangeSeries

    compliant = series._compliant_series
    dtypes = compliant._dtypes

    if isinstance(compliant, InterchangeSeries) and series.dtype == dtypes.Categorical:
        return compliant._native_series.describe_categorical[  # type: ignore[no-any-return]
            "is_ordered"
        ]
    if series.dtype == dtypes.Enum:
        return True
    if series.dtype != dtypes.Categorical:
        return False

    native = to_native(series)
    if is_polars_series(native):
        return native.dtype.ordering == "physical"  # type: ignore[attr-defined, no-any-return]
    # pandas, Modin and cuDF all expose the flag through the `.cat` accessor.
    if is_pandas_series(native) or is_modin_series(native) or is_cudf_series(native):
        return native.cat.ordered  # type: ignore[no-any-return]
    if is_pyarrow_chunked_array(native):
        return native.type.ordered  # type: ignore[no-any-return]
    # If it doesn't match any of the above, let's just play it safe and return False.
    return False  # pragma: no cover
def generate_unique_token(n_bytes: int, columns: list[str]) -> str:  # pragma: no cover
    """Deprecated alias of `generate_temporary_column_name`.

    Emits a `DeprecationWarning` and forwards both arguments unchanged.
    """
    deprecation_message = (
        "Use `generate_temporary_column_name` instead. `generate_unique_token` is "
        "deprecated and it will be removed in future versions"
    )
    # stacklevel=2 attributes the warning to the caller of this function.
    warn(deprecation_message, DeprecationWarning, stacklevel=2)
    return generate_temporary_column_name(n_bytes=n_bytes, columns=columns)
def generate_temporary_column_name(n_bytes: int, columns: list[str]) -> str:
    """Produce a random hex token that does not clash with any name in `columns`.

    Candidates come from
    [python secrets token_hex](https://docs.python.org/3/library/secrets.html#secrets.token_hex),
    which returns a hex string built from `n_bytes` random bytes.

    Arguments:
        n_bytes: The number of bytes to generate for the token.
        columns: The list of columns to check for uniqueness.

    Returns:
        A token that is not present in the given list of columns.

    Raises:
        AssertionError: If all 101 generated candidates are already present
            in `columns`.

    Examples:
        >>> import narwhals as nw
        >>> columns = ["abc", "xyz"]
        >>> nw.generate_temporary_column_name(n_bytes=8, columns=columns) not in columns
        True
    """
    max_draws = 101
    for _ in range(max_draws):
        candidate = token_hex(n_bytes)
        if candidate not in columns:
            return candidate

    msg = (
        "Internal Error: Narwhals was not able to generate a column name with "
        f"{n_bytes=} and not in {columns}"
    )
    raise AssertionError(msg)
def parse_columns_to_drop(
    compliant_frame: Any,
    columns: Iterable[str],
    strict: bool,  # noqa: FBT001
) -> list[str]:
    """Work out which of the requested columns should be dropped.

    Arguments:
        compliant_frame: Object exposing a `columns` attribute.
        columns: Names requested for removal.
        strict: When true, every requested name must exist in the frame.

    Returns:
        With `strict`, the requested names as a list in their original order.
        Otherwise, only the requested names that exist in the frame (built
        from a set intersection, so order is not guaranteed).

    Raises:
        ColumnNotFoundError: With `strict`, for the first requested name
            that is missing from the frame.
    """
    available = set(compliant_frame.columns)
    requested = list(columns)

    if not strict:
        return list(available.intersection(set(requested)))

    for name in requested:
        if name in available:
            continue
        msg = f'"{name}" not found'
        raise ColumnNotFoundError(msg)
    return requested
def is_sequence_but_not_str(sequence: Any) -> TypeGuard[Sequence[Any]]:
    """Return True for any `Sequence` instance except `str`."""
    if isinstance(sequence, str):
        return False
    return isinstance(sequence, Sequence)