Files
Buffteks-Website/streamlit-venv/lib/python3.10/site-packages/narwhals/_expression_parsing.py
2025-01-10 21:40:35 +00:00

333 lines
11 KiB
Python
Executable File

# Utilities for expression parsing
# Useful for backends which don't have any concept of expressions, such
# as pandas or PyArrow.
from __future__ import annotations
from copy import copy
from typing import TYPE_CHECKING
from typing import Any
from typing import Sequence
from typing import TypeVar
from typing import Union
from typing import cast
from typing import overload
from narwhals.dependencies import is_numpy_array
# Typing-only imports and aliases. This module uses
# `from __future__ import annotations`, so annotations that mention these
# names are not evaluated at runtime and the backends need not be imported.
if TYPE_CHECKING:
    from narwhals._arrow.dataframe import ArrowDataFrame
    from narwhals._arrow.expr import ArrowExpr
    from narwhals._arrow.namespace import ArrowNamespace
    from narwhals._arrow.series import ArrowSeries
    from narwhals._arrow.typing import IntoArrowExpr
    from narwhals._dask.dataframe import DaskLazyFrame
    from narwhals._dask.expr import DaskExpr
    from narwhals._dask.namespace import DaskNamespace
    from narwhals._dask.typing import IntoDaskExpr
    from narwhals._pandas_like.dataframe import PandasLikeDataFrame
    from narwhals._pandas_like.expr import PandasLikeExpr
    from narwhals._pandas_like.namespace import PandasLikeNamespace
    from narwhals._pandas_like.series import PandasLikeSeries
    from narwhals._pandas_like.typing import IntoPandasLikeExpr
    from narwhals._polars.expr import PolarsExpr
    from narwhals._polars.namespace import PolarsNamespace
    from narwhals._polars.series import PolarsSeries
    from narwhals._polars.typing import IntoPolarsExpr

    # Namespace objects of the four backends imported above.
    CompliantNamespace = Union[
        PandasLikeNamespace, ArrowNamespace, DaskNamespace, PolarsNamespace
    ]
    # Expression objects of the four backends.
    CompliantExpr = Union[PandasLikeExpr, ArrowExpr, DaskExpr, PolarsExpr]
    # Inputs accepted where an expression is expected (see `parse_into_expr`).
    IntoCompliantExpr = Union[
        IntoPandasLikeExpr, IntoArrowExpr, IntoDaskExpr, IntoPolarsExpr
    ]
    # Type variables bounded by the unions above, so a signature can tie its
    # input and output types together.
    IntoCompliantExprT = TypeVar("IntoCompliantExprT", bound=IntoCompliantExpr)
    CompliantExprT = TypeVar("CompliantExprT", bound=CompliantExpr)
    # Note: this union has no Dask member.
    CompliantSeries = Union[PandasLikeSeries, ArrowSeries, PolarsSeries]
    # Homogeneous lists of evaluated columns; the Dask entry is `list[DaskExpr]`.
    ListOfCompliantSeries = Union[
        list[PandasLikeSeries], list[ArrowSeries], list[DaskExpr], list[PolarsSeries]
    ]
    # Homogeneous lists of expressions, one alternative per backend.
    ListOfCompliantExpr = Union[
        list[PandasLikeExpr], list[ArrowExpr], list[DaskExpr], list[PolarsExpr]
    ]
    # Note: this union has no Polars member.
    CompliantDataFrame = Union[PandasLikeDataFrame, ArrowDataFrame, DaskLazyFrame]

# Unconstrained type variable; `maybe_evaluate_expr` uses it to type the
# non-expression values it hands back unchanged.
T = TypeVar("T")
def evaluate_into_expr(
    df: CompliantDataFrame, into_expr: IntoCompliantExpr
) -> ListOfCompliantSeries:
    """Evaluate one expression-like input against `df`.

    The input is first normalised with `parse_into_expr`, using the namespace
    reported by `df.__narwhals_namespace__()`; the resulting expression is
    then called on `df`.

    Returns:
        Whatever the parsed expression's `_call` yields for `df` (annotated as
        a list of backend columns).
    """
    plx = df.__narwhals_namespace__()
    parsed = parse_into_expr(into_expr, namespace=plx)
    return parsed._call(df)  # type: ignore[arg-type]
@overload
def evaluate_into_exprs(
    df: PandasLikeDataFrame,
    *exprs: IntoPandasLikeExpr,
    **named_exprs: IntoPandasLikeExpr,
) -> list[PandasLikeSeries]: ...


@overload
def evaluate_into_exprs(
    df: ArrowDataFrame,
    *exprs: IntoArrowExpr,
    **named_exprs: IntoArrowExpr,
) -> list[ArrowSeries]: ...


@overload
def evaluate_into_exprs(
    df: DaskLazyFrame,
    *exprs: IntoDaskExpr,
    **named_exprs: IntoDaskExpr,
) -> list[DaskExpr]: ...


def evaluate_into_exprs(
    df: CompliantDataFrame,
    *exprs: IntoCompliantExprT,
    **named_exprs: IntoCompliantExprT,
) -> ListOfCompliantSeries:
    """Evaluate every positional and named expression into columns.

    Positional inputs are evaluated in order and their results flattened into
    a single list. Each named input is evaluated afterwards, must produce
    exactly one column, and that column is renamed to the keyword via
    `.alias(name)` before being appended.

    Raises:
        AssertionError: if a named expression evaluates to anything other than
            exactly one column. This includes an empty result, which would
            otherwise surface as an unhelpful `IndexError` on `[0]`.
    """
    series: ListOfCompliantSeries = [  # type: ignore[assignment]
        column
        for into_expr in exprs
        for column in evaluate_into_expr(df, into_expr)
    ]
    for name, expr in named_exprs.items():
        evaluated_expr = evaluate_into_expr(df, expr)
        # `!= 1` (rather than `> 1`) so that an empty result is rejected with
        # the same descriptive error instead of failing on the index below.
        if len(evaluated_expr) != 1:
            msg = "Named expressions must return a single column"  # pragma: no cover
            raise AssertionError(msg)
        series.append(evaluated_expr[0].alias(name))  # type: ignore[arg-type]
    return series
def maybe_evaluate_expr(
    df: CompliantDataFrame, expr: CompliantExpr | T
) -> ListOfCompliantSeries | T:
    """Evaluate `expr` against `df` when it is an expression.

    An object counts as an expression if it exposes a `__narwhals_expr__`
    attribute; in that case the result of `expr._call(df)` is returned.
    Anything else is handed back untouched.
    """
    if not hasattr(expr, "__narwhals_expr__"):
        return expr
    compliant_expr = cast("CompliantExpr", expr)
    return compliant_expr._call(df)  # type: ignore[arg-type]
@overload
def parse_into_exprs(
    *exprs: IntoPandasLikeExpr,
    namespace: PandasLikeNamespace,
    **named_exprs: IntoPandasLikeExpr,
) -> list[PandasLikeExpr]: ...


@overload
def parse_into_exprs(
    *exprs: IntoArrowExpr,
    namespace: ArrowNamespace,
    **named_exprs: IntoArrowExpr,
) -> list[ArrowExpr]: ...


@overload
def parse_into_exprs(
    *exprs: IntoDaskExpr,
    namespace: DaskNamespace,
    **named_exprs: IntoDaskExpr,
) -> list[DaskExpr]: ...


@overload
def parse_into_exprs(
    *exprs: IntoPolarsExpr,
    namespace: PolarsNamespace,
    **named_exprs: IntoPolarsExpr,
) -> list[PolarsExpr]: ...


def parse_into_exprs(
    *exprs: IntoCompliantExpr,
    namespace: CompliantNamespace,
    **named_exprs: IntoCompliantExpr,
) -> ListOfCompliantExpr:
    """Turn every input into an expression via `parse_into_expr`.

    Positional inputs come first, in the order given. Keyword inputs follow,
    each one parsed and then renamed to its keyword with `.alias(name)`.
    """
    parsed = [parse_into_expr(into_expr, namespace=namespace) for into_expr in exprs]
    for name, into_expr in named_exprs.items():
        aliased = parse_into_expr(into_expr, namespace=namespace).alias(name)
        parsed.append(aliased)
    return parsed  # type: ignore[return-value]
def parse_into_expr(
    into_expr: IntoCompliantExpr,
    *,
    namespace: CompliantNamespace,
) -> CompliantExpr:
    """Coerce `into_expr` into an expression of the given `namespace`.

    Polars accepts both `df.select('a')` and `df.select(pl.col('a'))`, and
    Narwhals mirrors that. The checks run in this order:

    - an object with `__narwhals_expr__` is returned unchanged;
    - an object with `__narwhals_series__` is wrapped with
      `namespace._create_expr_from_series`;
    - a string becomes `namespace.col(<string>)`;
    - a numpy array is converted to a series with
      `namespace._create_compliant_series` and then wrapped like a series.

    Raises:
        TypeError: if none of the cases above applies.
    """
    if hasattr(into_expr, "__narwhals_expr__"):
        return into_expr  # type: ignore[return-value]
    if hasattr(into_expr, "__narwhals_series__"):
        return namespace._create_expr_from_series(into_expr)  # type: ignore[arg-type]
    if isinstance(into_expr, str):
        return namespace.col(into_expr)
    if not is_numpy_array(into_expr):
        msg = (
            f"Expected an object which can be converted into an expression, got {type(into_expr)}\n\n"  # pragma: no cover
            "Hint: if you were trying to select a column which does not have a string column name, then "
            "you should explicitly use `nw.col`.\nFor example, `df.select(nw.col(0))` if you have a column "
            "named `0`."
        )
        raise TypeError(msg)
    # Only numpy arrays reach this point: array -> series -> expression.
    as_series = namespace._create_compliant_series(into_expr)
    return namespace._create_expr_from_series(as_series)  # type: ignore[arg-type]
def reuse_series_implementation(
    expr: CompliantExprT,
    attr: str,
    *args: Any,
    returns_scalar: bool = False,
    **kwargs: Any,
) -> CompliantExprT:
    """Build `Expr.<attr>` on top of the existing `Series.<attr>`.

    The returned expression, when called on a dataframe, evaluates `expr`,
    and calls the method named `attr` on every resulting column.

    Arguments
        expr: expression object.
        attr: name of the Series method to call.
        returns_scalar: whether the Series method returns a scalar. If so, each
            result is passed, together with the column it came from, to the
            namespace's `_create_series_from_scalar`.
        args, kwargs: forwarded to the Series method; any of them that is
            itself an expression is evaluated against the dataframe first
            (see `maybe_evaluate_expr`).

    Raises:
        AssertionError: if exactly one of the tracked root/output names ends
            up being `None`, or (when the returned callable runs) if the
            produced column names differ from `expr._output_names`.
    """
    plx = expr.__narwhals_namespace__()

    def func(df: CompliantDataFrame) -> list[CompliantSeries]:
        call_args = [maybe_evaluate_expr(df, arg) for arg in args]
        call_kwargs = {
            key: maybe_evaluate_expr(df, value) for key, value in kwargs.items()
        }
        out: list[CompliantSeries] = []
        for column in expr._call(df):  # type: ignore[arg-type]
            if returns_scalar:
                result = plx._create_series_from_scalar(
                    getattr(column, attr)(*call_args, **call_kwargs),
                    column,  # type: ignore[arg-type]
                )
            else:
                result = getattr(column, attr)(*call_args, **call_kwargs)
            out.append(result)
        expected_names = expr._output_names
        if expected_names is not None and (
            [s.name for s in out] != expected_names
        ):  # pragma: no cover
            msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues"
            raise AssertionError(msg)
        return out

    # Accumulate root names from every argument of the same class as `expr`.
    # As soon as an expression without root names is involved (either `expr`
    # itself or one of the arguments), tracking is abandoned and both root
    # and output names become None.
    root_names = copy(expr._root_names)
    output_names = expr._output_names
    for arg in (*args, *kwargs.values()):
        if root_names is None:
            output_names = None
            break
        if not isinstance(arg, expr.__class__):
            continue
        if arg._root_names is None:
            root_names = None
            output_names = None
            break
        root_names.extend(arg._root_names)

    # Root and output names must be known together or unknown together.
    if (output_names is None) != (root_names is None):  # pragma: no cover
        msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues"
        raise AssertionError(msg)

    return plx._create_expr_from_callable(  # type: ignore[return-value]
        func,  # type: ignore[arg-type]
        depth=expr._depth + 1,
        function_name=f"{expr._function_name}->{attr}",
        root_names=root_names,
        output_names=output_names,
    )
def reuse_series_namespace_implementation(
    expr: CompliantExprT, series_namespace: str, attr: str, *args: Any, **kwargs: Any
) -> CompliantExprT:
    """Build `Expr.<series_namespace>.<attr>` from the Series equivalent.

    Counterpart of `reuse_series_implementation` for accessor methods such as
    `Expr.dt.foo`. `args` and `kwargs` are forwarded to the Series method as
    given, and the root and output names of `expr` are passed through
    unchanged.
    """
    plx = expr.__narwhals_namespace__()

    def func(df: CompliantDataFrame) -> list[Any]:
        results = []
        for series in expr._call(df):  # type: ignore[arg-type]
            accessor = getattr(series, series_namespace)
            results.append(getattr(accessor, attr)(*args, **kwargs))
        return results

    return plx._create_expr_from_callable(  # type: ignore[return-value]
        func,
        depth=expr._depth + 1,
        function_name=f"{expr._function_name}->{series_namespace}.{attr}",
        root_names=expr._root_names,
        output_names=expr._output_names,
    )
def is_simple_aggregation(expr: CompliantExpr) -> bool:
    """Tell whether `expr` is shallow enough for the pandas fastpath.

    Expressions with a depth below 2 qualify, for example:

    - nw.col('a').mean()  # depth 1
    - nw.mean('a')  # depth 1
    - nw.len()  # depth 0

    whereas something like

    - nw.col('a').filter(nw.col('b')>nw.col('c')).max()

    does not.
    """
    depth = expr._depth
    return depth < 2
def combine_root_names(parsed_exprs: Sequence[CompliantExpr]) -> list[str] | None:
    """Concatenate the root names of all `parsed_exprs`, in order.

    Starts from a copy of the first expression's root names, so that list is
    not mutated. Items without a `__narwhals_expr__` attribute are skipped.
    Returns `None` if the first expression has no root names, or as soon as a
    later expression without root names is encountered.
    """
    combined = copy(parsed_exprs[0]._root_names)
    for other in parsed_exprs[1:]:
        if combined is None:
            # Nothing left to track.
            break
        if not hasattr(other, "__narwhals_expr__"):
            continue
        extra = other._root_names
        if extra is None:
            combined = None
            break
        combined.extend(extra)
    return combined
def reduce_output_names(parsed_exprs: Sequence[CompliantExpr]) -> list[str] | None:
    """Return the left-most output name as a list of at most one element.

    Only the first expression is inspected; `None` is returned when it has no
    output names.
    """
    names = parsed_exprs[0]._output_names
    if names is None:
        return None
    return names[:1]