333 lines
11 KiB
Python
Executable File
333 lines
11 KiB
Python
Executable File
# Utilities for expression parsing
|
|
# Useful for backends which don't have any concept of expressions, such
# as pandas or PyArrow.
|
|
from __future__ import annotations
|
|
|
|
from copy import copy
|
|
from typing import TYPE_CHECKING
|
|
from typing import Any
|
|
from typing import Sequence
|
|
from typing import TypeVar
|
|
from typing import Union
|
|
from typing import cast
|
|
from typing import overload
|
|
|
|
from narwhals.dependencies import is_numpy_array
|
|
|
|
if TYPE_CHECKING:
    # Everything in this branch exists for static type checking only: the
    # backend classes are imported, and the aliases below are defined, solely
    # so that annotations in this module can refer to them.
    from narwhals._arrow.dataframe import ArrowDataFrame
    from narwhals._arrow.expr import ArrowExpr
    from narwhals._arrow.namespace import ArrowNamespace
    from narwhals._arrow.series import ArrowSeries
    from narwhals._arrow.typing import IntoArrowExpr
    from narwhals._dask.dataframe import DaskLazyFrame
    from narwhals._dask.expr import DaskExpr
    from narwhals._dask.namespace import DaskNamespace
    from narwhals._dask.typing import IntoDaskExpr
    from narwhals._pandas_like.dataframe import PandasLikeDataFrame
    from narwhals._pandas_like.expr import PandasLikeExpr
    from narwhals._pandas_like.namespace import PandasLikeNamespace
    from narwhals._pandas_like.series import PandasLikeSeries
    from narwhals._pandas_like.typing import IntoPandasLikeExpr
    from narwhals._polars.expr import PolarsExpr
    from narwhals._polars.namespace import PolarsNamespace
    from narwhals._polars.series import PolarsSeries
    from narwhals._polars.typing import IntoPolarsExpr

    # Any one of the backend namespace classes.
    CompliantNamespace = Union[
        PandasLikeNamespace, ArrowNamespace, DaskNamespace, PolarsNamespace
    ]
    # Any one of the backend expression classes.
    CompliantExpr = Union[PandasLikeExpr, ArrowExpr, DaskExpr, PolarsExpr]
    # Any input type that one of the backends accepts in place of an expression.
    IntoCompliantExpr = Union[
        IntoPandasLikeExpr, IntoArrowExpr, IntoDaskExpr, IntoPolarsExpr
    ]
    # Type variables bounded by the unions above, used by signatures that
    # annotate several parameters (or a parameter and the return) alike.
    IntoCompliantExprT = TypeVar("IntoCompliantExprT", bound=IntoCompliantExpr)
    CompliantExprT = TypeVar("CompliantExprT", bound=CompliantExpr)
    # Any one of the backend series classes (note: no Dask member here).
    CompliantSeries = Union[PandasLikeSeries, ArrowSeries, PolarsSeries]
    # A homogeneous list of evaluated columns; the Dask member is a list of
    # `DaskExpr` rather than a list of series.
    ListOfCompliantSeries = Union[
        list[PandasLikeSeries], list[ArrowSeries], list[DaskExpr], list[PolarsSeries]
    ]
    # A homogeneous list of expressions from a single backend.
    ListOfCompliantExpr = Union[
        list[PandasLikeExpr], list[ArrowExpr], list[DaskExpr], list[PolarsExpr]
    ]
    # Any one of the backend dataframe classes (note: no Polars member here).
    CompliantDataFrame = Union[PandasLikeDataFrame, ArrowDataFrame, DaskLazyFrame]

# Unconstrained type variable; `maybe_evaluate_expr` uses it to annotate
# arguments that are handed back unchanged.
T = TypeVar("T")
|
|
|
|
|
|
def evaluate_into_expr(
    df: CompliantDataFrame, into_expr: IntoCompliantExpr
) -> ListOfCompliantSeries:
    """Parse `into_expr` with `df`'s namespace and evaluate it against `df`.

    Arguments
        df: frame the expression is evaluated on; its
            `__narwhals_namespace__()` is used for parsing.
        into_expr: anything accepted by `parse_into_expr`.

    Returns
        Whatever the parsed expression's `_call` yields for `df`.
    """
    namespace = df.__narwhals_namespace__()
    parsed = parse_into_expr(into_expr, namespace=namespace)
    return parsed._call(df)  # type: ignore[arg-type]
|
|
|
|
|
|
@overload
def evaluate_into_exprs(
    df: PandasLikeDataFrame,
    *exprs: IntoPandasLikeExpr,
    **named_exprs: IntoPandasLikeExpr,
) -> list[PandasLikeSeries]: ...


@overload
def evaluate_into_exprs(
    df: ArrowDataFrame,
    *exprs: IntoArrowExpr,
    **named_exprs: IntoArrowExpr,
) -> list[ArrowSeries]: ...


@overload
def evaluate_into_exprs(
    df: DaskLazyFrame,
    *exprs: IntoDaskExpr,
    **named_exprs: IntoDaskExpr,
) -> list[DaskExpr]: ...


def evaluate_into_exprs(
    df: CompliantDataFrame,
    *exprs: IntoCompliantExprT,
    **named_exprs: IntoCompliantExprT,
) -> ListOfCompliantSeries:
    """Evaluate each expr into Series.

    Arguments
        df: frame every expression is evaluated against.
        exprs: positional inputs; each may evaluate to any number of columns,
            which are all kept, in order.
        named_exprs: keyword inputs; each must evaluate to exactly one
            column, which is renamed to the keyword via `.alias`.

    Returns
        The columns from `exprs` followed by the aliased columns from
        `named_exprs`.

    Raises
        AssertionError: if a named expression evaluates to anything other
            than exactly one column.
    """
    series: ListOfCompliantSeries = [  # type: ignore[assignment]
        item
        for into_expr in exprs
        for item in evaluate_into_expr(df, into_expr)
    ]
    for name, expr in named_exprs.items():
        evaluated_expr = evaluate_into_expr(df, expr)
        # `!= 1` rather than `> 1`: an expression yielding zero columns must
        # also be rejected here, otherwise the `[0]` below fails with an
        # uninformative IndexError.
        if len(evaluated_expr) != 1:
            msg = "Named expressions must return a single column"  # pragma: no cover
            raise AssertionError(msg)
        to_append = evaluated_expr[0].alias(name)
        series.append(to_append)  # type: ignore[arg-type]
    return series
|
|
|
|
|
|
def maybe_evaluate_expr(
    df: CompliantDataFrame, expr: CompliantExpr | T
) -> ListOfCompliantSeries | T:
    """Evaluate `expr` against `df` when it is an expression; else pass it through.

    An object counts as an expression when it exposes a `__narwhals_expr__`
    attribute. Anything else is returned unchanged.
    """
    if not hasattr(expr, "__narwhals_expr__"):
        return expr
    compliant_expr = cast("CompliantExpr", expr)
    return compliant_expr._call(df)  # type: ignore[arg-type]
|
|
|
|
|
|
@overload
def parse_into_exprs(
    *exprs: IntoPandasLikeExpr,
    namespace: PandasLikeNamespace,
    **named_exprs: IntoPandasLikeExpr,
) -> list[PandasLikeExpr]: ...


@overload
def parse_into_exprs(
    *exprs: IntoArrowExpr,
    namespace: ArrowNamespace,
    **named_exprs: IntoArrowExpr,
) -> list[ArrowExpr]: ...


@overload
def parse_into_exprs(
    *exprs: IntoDaskExpr,
    namespace: DaskNamespace,
    **named_exprs: IntoDaskExpr,
) -> list[DaskExpr]: ...


@overload
def parse_into_exprs(
    *exprs: IntoPolarsExpr,
    namespace: PolarsNamespace,
    **named_exprs: IntoPolarsExpr,
) -> list[PolarsExpr]: ...


def parse_into_exprs(
    *exprs: IntoCompliantExpr,
    namespace: CompliantNamespace,
    **named_exprs: IntoCompliantExpr,
) -> ListOfCompliantExpr:
    """Turn every input into an expression via `parse_into_expr`.

    Positional inputs are parsed as-is; keyword inputs are parsed and then
    renamed to their keyword with `.alias`. The result lists the positional
    expressions first, followed by the aliased keyword expressions.
    """
    positional = [parse_into_expr(item, namespace=namespace) for item in exprs]
    aliased = [
        parse_into_expr(value, namespace=namespace).alias(alias)
        for alias, value in named_exprs.items()
    ]
    return positional + aliased
|
|
|
|
|
|
def parse_into_expr(
    into_expr: IntoCompliantExpr,
    *,
    namespace: CompliantNamespace,
) -> CompliantExpr:
    """Coerce `into_expr` into an expression of `namespace`.

    Polars accepts both `df.select('a')` and `df.select(pl.col('a'))`, and
    Narwhals mirrors that. The checks run in this order:

    - an expression (has `__narwhals_expr__`) is returned untouched
    - a Series (has `__narwhals_series__`) is wrapped in an expression
    - a string becomes `namespace.col(<string>)`
    - a numpy array is converted to a Series, which is then wrapped
    - anything else raises `TypeError`
    """
    if hasattr(into_expr, "__narwhals_expr__"):
        return into_expr  # type: ignore[return-value]

    # Both the Series branch and the numpy branch finish by wrapping a series
    # in an expression, so they share the final return below.
    if hasattr(into_expr, "__narwhals_series__"):
        series_like = into_expr
    elif isinstance(into_expr, str):
        return namespace.col(into_expr)
    elif is_numpy_array(into_expr):
        series_like = namespace._create_compliant_series(into_expr)
    else:
        msg = (
            f"Expected an object which can be converted into an expression, got {type(into_expr)}\n\n"  # pragma: no cover
            "Hint: if you were trying to select a column which does not have a string column name, then "
            "you should explicitly use `nw.col`.\nFor example, `df.select(nw.col(0))` if you have a column "
            "named `0`."
        )
        raise TypeError(msg)
    return namespace._create_expr_from_series(series_like)  # type: ignore[arg-type]
|
|
|
|
|
|
def reuse_series_implementation(
    expr: CompliantExprT,
    attr: str,
    *args: Any,
    returns_scalar: bool = False,
    **kwargs: Any,
) -> CompliantExprT:
    """Build `Expr.<attr>` on top of the existing `Series.<attr>`.

    The returned expression, when evaluated, evaluates `expr`, then calls the
    method named `attr` on each resulting column.

    Arguments
        expr: expression object.
        attr: name of the Series method to call.
        returns_scalar: whether the Series method returns a scalar. If so,
            each result is passed through the namespace's
            `_create_series_from_scalar` together with the source column.
        args, kwargs: forwarded to the Series method. Any of them that is
            itself an expression is evaluated against the frame first.
    """
    plx = expr.__narwhals_namespace__()

    def func(df: CompliantDataFrame) -> list[CompliantSeries]:
        call_args = [maybe_evaluate_expr(df, arg) for arg in args]
        call_kwargs = {
            key: maybe_evaluate_expr(df, value) for key, value in kwargs.items()
        }

        out: list[CompliantSeries] = []
        for column in expr._call(df):  # type: ignore[arg-type]
            result = getattr(column, attr)(*call_args, **call_kwargs)
            if returns_scalar:
                result = plx._create_series_from_scalar(
                    result,
                    column,  # type: ignore[arg-type]
                )
            out.append(result)

        expected_names = expr._output_names
        if expected_names is not None and (
            [s.name for s in out] != expected_names
        ):  # pragma: no cover
            msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues"
            raise AssertionError(msg)
        return out

    # Root names are accumulated from `expr` plus every same-class expression
    # found in args/kwargs. As soon as one of them has no known root names
    # (an anonymous expression such as nw.all()), tracking is abandoned and
    # both root and output names become None.
    root_names = copy(expr._root_names)
    output_names = expr._output_names
    for candidate in (*args, *kwargs.values()):
        if root_names is None:
            output_names = None
            break
        if not isinstance(candidate, expr.__class__):
            continue
        if candidate._root_names is None:
            root_names = None
            output_names = None
            break
        root_names.extend(candidate._root_names)

    # The two must be known together or unknown together.
    if (output_names is None) != (root_names is None):  # pragma: no cover
        msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues"
        raise AssertionError(msg)

    return plx._create_expr_from_callable(  # type: ignore[return-value]
        func,  # type: ignore[arg-type]
        depth=expr._depth + 1,
        function_name=f"{expr._function_name}->{attr}",
        root_names=root_names,
        output_names=output_names,
    )
|
|
|
|
|
|
def reuse_series_namespace_implementation(
    expr: CompliantExprT, series_namespace: str, attr: str, *args: Any, **kwargs: Any
) -> CompliantExprT:
    """Namespaced counterpart of `reuse_series_implementation`.

    Covers accessor methods such as `Expr.dt.foo`: for every column produced
    by `expr`, the accessor named `series_namespace` is looked up on the
    column and its method `attr` is called with `args` and `kwargs` exactly
    as given. Root and output names are taken over from `expr` unchanged.
    """
    plx = expr.__narwhals_namespace__()

    def func(df: CompliantDataFrame) -> list[CompliantSeries]:
        results = []
        for series in expr._call(df):  # type: ignore[arg-type]
            accessor = getattr(series, series_namespace)
            results.append(getattr(accessor, attr)(*args, **kwargs))
        return results

    return plx._create_expr_from_callable(  # type: ignore[return-value]
        func,
        depth=expr._depth + 1,
        function_name=f"{expr._function_name}->{series_namespace}.{attr}",
        root_names=expr._root_names,
        output_names=expr._output_names,
    )
|
|
|
|
|
|
def is_simple_aggregation(expr: CompliantExpr) -> bool:
    """Tell whether `expr` is shallow enough for the pandas fastpath.

    Shallow means an expression depth below 2, for example:

    - nw.col('a').mean()  # depth 1
    - nw.mean('a')  # depth 1
    - nw.len()  # depth 0

    whereas something like

    - nw.col('a').filter(nw.col('b')>nw.col('c')).max()

    is not considered simple.
    """
    depth_limit = 2
    return expr._depth < depth_limit
|
|
|
|
|
|
def combine_root_names(parsed_exprs: Sequence[CompliantExpr]) -> list[str] | None:
    """Concatenate the root names of all expressions in `parsed_exprs`.

    Starts from a copy of the first element's root names and appends those of
    every later element that is an expression (has `__narwhals_expr__`).
    Returns None when the first element, or any later expression reached
    while still tracking, has no known root names.
    """
    combined = copy(parsed_exprs[0]._root_names)
    for other in parsed_exprs[1:]:
        # Nothing to accumulate once tracking is off, or for non-expressions.
        if combined is None or not hasattr(other, "__narwhals_expr__"):
            continue
        if other._root_names is None:
            return None
        combined.extend(other._root_names)
    return combined
|
|
|
|
|
|
def reduce_output_names(parsed_exprs: Sequence[CompliantExpr]) -> list[str] | None:
    """Return the left-most output name of the first expression, as a list.

    The result is the first expression's output names truncated to at most
    one entry, or None when that expression has no known output names.
    """
    leading = parsed_exprs[0]
    if leading._output_names is None:
        return None
    return leading._output_names[:1]
|