from __future__ import annotations

import re
from enum import Enum
from enum import auto
from secrets import token_hex
from typing import TYPE_CHECKING
from typing import Any
from typing import Iterable
from typing import Sequence
from typing import TypeVar
from typing import cast
from warnings import warn

from narwhals._exceptions import ColumnNotFoundError
from narwhals.dependencies import get_cudf
from narwhals.dependencies import get_dask_dataframe
from narwhals.dependencies import get_modin
from narwhals.dependencies import get_pandas
from narwhals.dependencies import get_polars
from narwhals.dependencies import get_pyarrow
from narwhals.dependencies import is_cudf_series
from narwhals.dependencies import is_modin_series
from narwhals.dependencies import is_pandas_dataframe
from narwhals.dependencies import is_pandas_like_dataframe
from narwhals.dependencies import is_pandas_like_series
from narwhals.dependencies import is_pandas_series
from narwhals.dependencies import is_polars_series
from narwhals.dependencies import is_pyarrow_chunked_array
from narwhals.translate import to_native

if TYPE_CHECKING:
    from types import ModuleType

    import pandas as pd
    from typing_extensions import Self
    from typing_extensions import TypeGuard

    from narwhals.dataframe import BaseFrame
    from narwhals.series import Series

T = TypeVar("T")


class Implementation(Enum):
    """Enum of the native implementations that Narwhals recognises."""

    PANDAS = auto()
    MODIN = auto()
    CUDF = auto()
    PYARROW = auto()
    POLARS = auto()
    DASK = auto()
    UNKNOWN = auto()

    @classmethod
    def from_native_namespace(
        cls: type[Self], native_namespace: ModuleType
    ) -> Implementation:  # pragma: no cover
        """Instantiate Implementation object from a native namespace module."""
        mapping = {
            get_pandas(): Implementation.PANDAS,
            get_modin(): Implementation.MODIN,
            get_cudf(): Implementation.CUDF,
            get_pyarrow(): Implementation.PYARROW,
            get_polars(): Implementation.POLARS,
            get_dask_dataframe(): Implementation.DASK,
        }
        return mapping.get(native_namespace, Implementation.UNKNOWN)

    def to_native_namespace(self: Self) -> ModuleType:
        """Return the native namespace module corresponding to Implementation."""
        mapping = {
            Implementation.PANDAS: get_pandas(),
            Implementation.MODIN: get_modin(),
            Implementation.CUDF: get_cudf(),
            Implementation.PYARROW: get_pyarrow(),
            Implementation.POLARS: get_polars(),
            Implementation.DASK: get_dask_dataframe(),
        }
        return mapping[self]  # type: ignore[no-any-return]


def remove_prefix(text: str, prefix: str) -> str:
    if text.startswith(prefix):
        return text[len(prefix) :]
    return text  # pragma: no cover


def remove_suffix(text: str, suffix: str) -> str:  # pragma: no cover
    if text.endswith(suffix):
        return text[: -len(suffix)]
    return text  # pragma: no cover


def flatten(args: Any) -> list[Any]:
    if not args:
        return []
    if len(args) == 1 and _is_iterable(args[0]):
        return args[0]  # type: ignore[no-any-return]
    return args  # type: ignore[no-any-return]


def tupleify(arg: Any) -> Any:
    if not isinstance(arg, (list, tuple)):  # pragma: no cover
        return (arg,)
    return arg


def _is_iterable(arg: Any | Iterable[Any]) -> bool:
    from narwhals.series import Series

    if is_pandas_dataframe(arg) or is_pandas_series(arg):
        msg = f"Expected Narwhals class or scalar, got: {type(arg)}. Perhaps you forgot a `nw.from_native` somewhere?"
        raise TypeError(msg)
    if (pl := get_polars()) is not None and isinstance(
        arg, (pl.Series, pl.Expr, pl.DataFrame, pl.LazyFrame)
    ):
        msg = (
            f"Expected Narwhals class or scalar, got: {type(arg)}.\n\n"
            "Hint: Perhaps you\n"
            "- forgot a `nw.from_native` somewhere?\n"
            "- used `pl.col` instead of `nw.col`?"
        )
        raise TypeError(msg)

    return isinstance(arg, Iterable) and not isinstance(arg, (str, bytes, Series))


def parse_version(version: Sequence[str | int]) -> tuple[int, ...]:
    """Simple version parser; split into a tuple of ints for comparison."""
    # lifted from Polars
    if isinstance(version, str):  # pragma: no cover
        version = version.split(".")
    return tuple(int(re.sub(r"\D", "", str(v))) for v in version)


def isinstance_or_issubclass(obj: Any, cls: Any) -> bool:
    from narwhals.dtypes import DType

    if isinstance(obj, DType):
        return isinstance(obj, cls)
    return isinstance(obj, cls) or issubclass(obj, cls)


def validate_laziness(items: Iterable[Any]) -> None:
    from narwhals.dataframe import DataFrame
    from narwhals.dataframe import LazyFrame

    if all(isinstance(item, DataFrame) for item in items) or (
        all(isinstance(item, LazyFrame) for item in items)
    ):
        return
    msg = "The items to concatenate should either all be eager, or all lazy"
    raise NotImplementedError(msg)


def maybe_align_index(lhs: T, rhs: Series | BaseFrame[Any]) -> T:
    """
    Align `lhs` to the Index of `rhs`, if they're both pandas-like.

    Notes:
        This is only really intended for backwards-compatibility purposes,
        for example if your library already aligns indices for users.
        If you're designing a new library, we highly encourage you to not
        rely on the Index.
        For non-pandas-like inputs, this only checks that `lhs` and `rhs`
        are the same length.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2]}, index=[3, 4])
        >>> s_pd = pd.Series([6, 7], index=[4, 3])
        >>> df = nw.from_native(df_pd)
        >>> s = nw.from_native(s_pd, series_only=True)
        >>> nw.to_native(nw.maybe_align_index(df, s))
           a
        4  2
        3  1
    """
    from narwhals._pandas_like.dataframe import PandasLikeDataFrame
    from narwhals._pandas_like.series import PandasLikeSeries

    def _validate_index(index: Any) -> None:
        if not index.is_unique:
            msg = "given index doesn't have unique values"
            raise ValueError(msg)

    lhs_any = cast(Any, lhs)
    rhs_any = cast(Any, rhs)
    if isinstance(
        getattr(lhs_any, "_compliant_frame", None), PandasLikeDataFrame
    ) and isinstance(getattr(rhs_any, "_compliant_frame", None), PandasLikeDataFrame):
        _validate_index(lhs_any._compliant_frame._native_frame.index)
        _validate_index(rhs_any._compliant_frame._native_frame.index)
        return lhs_any._from_compliant_dataframe(  # type: ignore[no-any-return]
            lhs_any._compliant_frame._from_native_frame(
                lhs_any._compliant_frame._native_frame.loc[
                    rhs_any._compliant_frame._native_frame.index
                ]
            )
        )
    if isinstance(
        getattr(lhs_any, "_compliant_frame", None), PandasLikeDataFrame
    ) and isinstance(getattr(rhs_any, "_compliant_series", None), PandasLikeSeries):
        _validate_index(lhs_any._compliant_frame._native_frame.index)
        _validate_index(rhs_any._compliant_series._native_series.index)
        return lhs_any._from_compliant_dataframe(  # type: ignore[no-any-return]
            lhs_any._compliant_frame._from_native_frame(
                lhs_any._compliant_frame._native_frame.loc[
                    rhs_any._compliant_series._native_series.index
                ]
            )
        )
    if isinstance(
        getattr(lhs_any, "_compliant_series", None), PandasLikeSeries
    ) and isinstance(getattr(rhs_any, "_compliant_frame", None), PandasLikeDataFrame):
        _validate_index(lhs_any._compliant_series._native_series.index)
        _validate_index(rhs_any._compliant_frame._native_frame.index)
        return lhs_any._from_compliant_series(  # type: ignore[no-any-return]
            lhs_any._compliant_series._from_native_series(
                lhs_any._compliant_series._native_series.loc[
                    rhs_any._compliant_frame._native_frame.index
                ]
            )
        )
    if isinstance(
        getattr(lhs_any, "_compliant_series", None), PandasLikeSeries
    ) and isinstance(getattr(rhs_any, "_compliant_series", None), PandasLikeSeries):
        _validate_index(lhs_any._compliant_series._native_series.index)
        _validate_index(rhs_any._compliant_series._native_series.index)
        return lhs_any._from_compliant_series(  # type: ignore[no-any-return]
            lhs_any._compliant_series._from_native_series(
                lhs_any._compliant_series._native_series.loc[
                    rhs_any._compliant_series._native_series.index
                ]
            )
        )
    if len(lhs_any) != len(rhs_any):
        msg = f"Expected `lhs` and `rhs` to have the same length, got {len(lhs_any)} and {len(rhs_any)}"
        raise ValueError(msg)
    return lhs


def maybe_get_index(obj: T) -> Any | None:
    """
    Get the index of a DataFrame or a Series, if it's pandas-like.

    Notes:
        This is only really intended for backwards-compatibility purposes,
        for example if your library already aligns indices for users.
        If you're designing a new library, we highly encourage you to not
        rely on the Index.
        For non-pandas-like inputs, this returns `None`.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 5]})
        >>> df = nw.from_native(df_pd)
        >>> nw.maybe_get_index(df)
        RangeIndex(start=0, stop=2, step=1)
        >>> series_pd = pd.Series([1, 2])
        >>> series = nw.from_native(series_pd, series_only=True)
        >>> nw.maybe_get_index(series)
        RangeIndex(start=0, stop=2, step=1)
    """
    obj_any = cast(Any, obj)
    native_obj = to_native(obj_any)
    if is_pandas_like_dataframe(native_obj) or is_pandas_like_series(native_obj):
        return native_obj.index
    return None


def maybe_set_index(df: T, column_names: str | list[str]) -> T:
    """
    Set columns `column_names` to be the index of `df`, if `df` is pandas-like.

    Notes:
        This is only really intended for backwards-compatibility purposes,
        for example if your library already aligns indices for users.
        If you're designing a new library, we highly encourage you to not
        rely on the Index.
        For non-pandas-like inputs, this is a no-op.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 5]})
        >>> df = nw.from_native(df_pd)
        >>> nw.to_native(nw.maybe_set_index(df, "b"))  # doctest: +NORMALIZE_WHITESPACE
           a
        b
        4  1
        5  2
    """
    df_any = cast(Any, df)
    native_frame = to_native(df_any)
    if is_pandas_like_dataframe(native_frame):
        return df_any._from_compliant_dataframe(  # type: ignore[no-any-return]
            df_any._compliant_frame._from_native_frame(
                native_frame.set_index(column_names)
            )
        )
    return df_any  # type: ignore[no-any-return]


def maybe_reset_index(obj: T) -> T:
    """
    Reset the index to the default integer index of a DataFrame or a Series, if it's pandas-like.

    Notes:
        This is only really intended for backwards-compatibility purposes,
        for example if your library already resets the index for users.
        If you're designing a new library, we highly encourage you to not
        rely on the Index.
        For non-pandas-like inputs, this is a no-op.
    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 5]}, index=[6, 7])
        >>> df = nw.from_native(df_pd)
        >>> nw.to_native(nw.maybe_reset_index(df))
           a  b
        0  1  4
        1  2  5
        >>> series_pd = pd.Series([1, 2], index=[7, 8])
        >>> series = nw.from_native(series_pd, series_only=True)
        >>> nw.to_native(nw.maybe_reset_index(series))
        0    1
        1    2
        dtype: int64
    """
    obj_any = cast(Any, obj)
    native_obj = to_native(obj_any)
    if is_pandas_like_dataframe(native_obj):
        native_namespace = obj_any.__native_namespace__()
        if _has_default_index(native_obj, native_namespace):
            return obj_any  # type: ignore[no-any-return]
        return obj_any._from_compliant_dataframe(  # type: ignore[no-any-return]
            obj_any._compliant_frame._from_native_frame(
                native_obj.reset_index(drop=True)
            )
        )
    if is_pandas_like_series(native_obj):
        native_namespace = obj_any.__native_namespace__()
        if _has_default_index(native_obj, native_namespace):
            return obj_any  # type: ignore[no-any-return]
        return obj_any._from_compliant_series(  # type: ignore[no-any-return]
            obj_any._compliant_series._from_native_series(
                native_obj.reset_index(drop=True)
            )
        )
    return obj_any  # type: ignore[no-any-return]


def _has_default_index(
    native_frame_or_series: pd.Series | pd.DataFrame, native_namespace: Any
) -> bool:
    index = native_frame_or_series.index
    return (
        isinstance(index, native_namespace.RangeIndex)
        and index.start == 0
        and index.stop == len(index)
        and index.step == 1
    )


def maybe_convert_dtypes(obj: T, *args: bool, **kwargs: bool | str) -> T:
    """
    Convert columns or series to the best possible dtypes using dtypes supporting
    ``pd.NA``, if `obj` is pandas-like.

    Arguments:
        obj: DataFrame or Series.
        *args: Additional arguments which get passed through.
        **kwargs: Additional arguments which get passed through.

    Notes:
        For non-pandas-like inputs, this is a no-op.
        Also, `args` and `kwargs` just get passed down to the underlying library as-is.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>> import numpy as np
        >>> df_pd = pd.DataFrame(
        ...     {
        ...         "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
        ...         "b": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
        ...     }
        ... )
        >>> df = nw.from_native(df_pd)
        >>> nw.to_native(nw.maybe_convert_dtypes(df)).dtypes  # doctest: +NORMALIZE_WHITESPACE
        a      Int32
        b    boolean
        dtype: object
    """
    obj_any = cast(Any, obj)
    native_obj = to_native(obj_any)
    if is_pandas_like_dataframe(native_obj):
        return obj_any._from_compliant_dataframe(  # type: ignore[no-any-return]
            obj_any._compliant_frame._from_native_frame(
                native_obj.convert_dtypes(*args, **kwargs)
            )
        )
    if is_pandas_like_series(native_obj):
        return obj_any._from_compliant_series(  # type: ignore[no-any-return]
            obj_any._compliant_series._from_native_series(
                native_obj.convert_dtypes(*args, **kwargs)
            )
        )
    return obj_any  # type: ignore[no-any-return]


def is_ordered_categorical(series: Series) -> bool:
    """
    Return whether indices of categories are semantically meaningful.

    This is a convenience function for accessing what would otherwise be
    the `is_ordered` property from the DataFrame Interchange Protocol,
    see https://data-apis.org/dataframe-protocol/latest/API.html.

    - For Polars:
      - Enums are always ordered.
      - Categoricals are ordered if `dtype.ordering == "physical"`.
    - For pandas-like APIs:
      - Categoricals are ordered if `dtype.cat.ordered == True`.
    - For PyArrow table:
      - Categoricals are ordered if `dtype.type.ordered == True`.
    Examples:
        >>> import narwhals as nw
        >>> import pandas as pd
        >>> import polars as pl
        >>> data = ["x", "y"]
        >>> s_pd = pd.Series(data, dtype=pd.CategoricalDtype(ordered=True))
        >>> s_pl = pl.Series(data, dtype=pl.Categorical(ordering="physical"))

        Let's define a library-agnostic function:

        >>> @nw.narwhalify
        ... def func(s):
        ...     return nw.is_ordered_categorical(s)

        Then, we can pass any supported library to `func`:

        >>> func(s_pd)
        True
        >>> func(s_pl)
        True
    """
    from narwhals._interchange.series import InterchangeSeries

    dtypes = series._compliant_series._dtypes
    if (
        isinstance(series._compliant_series, InterchangeSeries)
        and series.dtype == dtypes.Categorical
    ):
        return series._compliant_series._native_series.describe_categorical[  # type: ignore[no-any-return]
            "is_ordered"
        ]
    if series.dtype == dtypes.Enum:
        return True
    if series.dtype != dtypes.Categorical:
        return False
    native_series = to_native(series)
    if is_polars_series(native_series):
        return native_series.dtype.ordering == "physical"  # type: ignore[attr-defined, no-any-return]
    if is_pandas_series(native_series):
        return native_series.cat.ordered  # type: ignore[no-any-return]
    if is_modin_series(native_series):  # pragma: no cover
        return native_series.cat.ordered  # type: ignore[no-any-return]
    if is_cudf_series(native_series):  # pragma: no cover
        return native_series.cat.ordered  # type: ignore[no-any-return]
    if is_pyarrow_chunked_array(native_series):
        return native_series.type.ordered  # type: ignore[no-any-return]
    # If it doesn't match any of the above, let's just play it safe and return False.
    return False  # pragma: no cover


def generate_unique_token(n_bytes: int, columns: list[str]) -> str:  # pragma: no cover
    warn(
        "Use `generate_temporary_column_name` instead. `generate_unique_token` is "
        "deprecated and will be removed in a future version.",
        DeprecationWarning,
        stacklevel=2,
    )
    return generate_temporary_column_name(n_bytes=n_bytes, columns=columns)


def generate_temporary_column_name(n_bytes: int, columns: list[str]) -> str:
    """Generate a unique token of `n_bytes` bytes that is not present in the given
    list of columns.

    It relies on the [python secrets token_hex](https://docs.python.org/3/library/secrets.html#secrets.token_hex)
    function, which returns a hexadecimal string representing `n_bytes` random bytes.

    Arguments:
        n_bytes: The number of bytes to generate for the token.
        columns: The list of columns to check for uniqueness.

    Returns:
        A unique token that is not present in the given list of columns.

    Raises:
        AssertionError: If a unique token cannot be generated after 100 attempts.

    Examples:
        >>> import narwhals as nw
        >>> columns = ["abc", "xyz"]
        >>> nw.generate_temporary_column_name(n_bytes=8, columns=columns) not in columns
        True
    """
    counter = 0
    while True:
        token = token_hex(n_bytes)
        if token not in columns:
            return token

        counter += 1
        if counter > 100:
            msg = (
                "Internal Error: Narwhals was not able to generate a column name with "
                f"{n_bytes=} that is not in {columns}"
            )
            raise AssertionError(msg)


def parse_columns_to_drop(
    compliant_frame: Any,
    columns: Iterable[str],
    strict: bool,  # noqa: FBT001
) -> list[str]:
    # In strict mode, raise if any requested column is missing; otherwise
    # silently drop only the columns which are actually present.
    cols = set(compliant_frame.columns)
    to_drop = list(columns)
    if strict:
        for d in to_drop:
            if d not in cols:
                msg = f'"{d}" not found'
                raise ColumnNotFoundError(msg)
    else:
        to_drop = list(cols.intersection(set(to_drop)))
    return to_drop


def is_sequence_but_not_str(sequence: Any) -> TypeGuard[Sequence[Any]]:
    return isinstance(sequence, Sequence) and not isinstance(sequence, str)
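

if __name__ == "__main__":  # pragma: no cover
    # Minimal usage sketch, not part of Narwhals' behaviour: it only illustrates
    # how a few of the pure helpers above compose. It assumes pandas is installed;
    # the variable names below are illustrative only.
    import pandas as pd

    demo = pd.DataFrame({"a": [1, 2], "b": [4, 5]}, index=[7, 8])

    # A non-default index is detected, and a reset index passes the check.
    assert not _has_default_index(demo, pd)
    assert _has_default_index(demo.reset_index(drop=True), pd)

    # Temporary column names never collide with existing columns.
    tmp_name = generate_temporary_column_name(n_bytes=8, columns=list(demo.columns))
    assert tmp_name not in demo.columns

    # Small string/sequence helpers.
    assert parse_version("1.2.3") == (1, 2, 3)
    assert remove_prefix("nw_a", "nw_") == "a"
    assert is_sequence_but_not_str(["a", "b"]) and not is_sequence_but_not_str("ab")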