Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions python/datafusion/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1894,6 +1894,15 @@ def approx_distinct(
Args:
expression: Values to check for distinct entries
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 1, 2, 3]})
>>> result = df.aggregate(
... [], [dfn.functions.approx_distinct(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py() == 3
Comment on lines +1902 to +1904
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

>= 2 is a weak regression signal for a 4-row input with 3 distinct values.

Could we pick an input where the approximation is still deterministic enough to show a concrete answer, or at least tighten the expectation so the example documents the intended behavior more clearly?

True
"""
filter_raw = filter.expr if filter is not None else None

Expand All @@ -1912,6 +1921,15 @@ def approx_median(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: Values to find the median for
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
>>> result = df.aggregate(
... [], [dfn.functions.approx_median(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.approx_median(expression.expr, filter=filter_raw))
Expand Down Expand Up @@ -1943,6 +1961,15 @@ def approx_percentile_cont(
percentile: This must be between 0.0 and 1.0, inclusive
num_centroids: Max bin size for the t-digest algorithm
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0, 4.0, 5.0]})
>>> result = df.aggregate(
... [], [dfn.functions.approx_percentile_cont(dfn.col("a"), 0.5).alias("v")])
>>> result.collect_column("v")[0].as_py()
3.0
"""
sort_expr_raw = sort_or_default(sort_expression)
filter_raw = filter.expr if filter is not None else None
Expand Down Expand Up @@ -1975,6 +2002,15 @@ def approx_percentile_cont_with_weight(
num_centroids: Max bin size for the t-digest algorithm
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "w": [1.0, 1.0, 1.0]})
>>> result = df.aggregate(
... [], [dfn.functions.approx_percentile_cont_with_weight(dfn.col("a"),
... dfn.col("w"), 0.5).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
sort_expr_raw = sort_or_default(sort_expression)
filter_raw = filter.expr if filter is not None else None
Expand Down Expand Up @@ -2038,6 +2074,14 @@ def avg(
Args:
expression: Values to compute the average of
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
>>> result = df.aggregate([], [dfn.functions.avg(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.avg(expression.expr, filter=filter_raw))
Expand Down Expand Up @@ -2076,6 +2120,14 @@ def count(
expressions: Values to count
distinct: If True, a single entry for each distinct value will be in the result
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
>>> result = df.aggregate([], [dfn.functions.count(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
3
"""
filter_raw = filter.expr if filter is not None else None

Expand Down Expand Up @@ -2140,6 +2192,14 @@ def max(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: The value to find the maximum of
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
>>> result = df.aggregate([], [dfn.functions.max(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
3
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.max(expression.expr, filter=filter_raw))
Expand All @@ -2149,6 +2209,14 @@ def mean(expression: Expr, filter: Expr | None = None) -> Expr:
"""Returns the average (mean) value of the argument.

This is an alias for :py:func:`avg`.

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
>>> result = df.aggregate([], [dfn.functions.mean(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
return avg(expression, filter)

Expand All @@ -2168,6 +2236,14 @@ def median(
expression: The value to compute the median of
distinct: If True, a single entry for each distinct value will be in the result
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
>>> result = df.aggregate([], [dfn.functions.median(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
2.0
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.median(expression.expr, distinct=distinct, filter=filter_raw))
Expand All @@ -2182,6 +2258,14 @@ def min(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: The value to find the minimum of
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
>>> result = df.aggregate([], [dfn.functions.min(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
1
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.min(expression.expr, filter=filter_raw))
Expand All @@ -2201,6 +2285,14 @@ def sum(
Args:
expression: Values to sum
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
>>> result = df.aggregate([], [dfn.functions.sum(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
6
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.sum(expression.expr, filter=filter_raw))
Expand Down Expand Up @@ -2618,6 +2710,14 @@ def bit_and(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: Argument to perform bitwise calculation on
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [7, 3]})
>>> result = df.aggregate([], [dfn.functions.bit_and(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
3
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.bit_and(expression.expr, filter=filter_raw))
Expand All @@ -2634,6 +2734,14 @@ def bit_or(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: Argument to perform bitwise calculation on
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2]})
>>> result = df.aggregate([], [dfn.functions.bit_or(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
3
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.bit_or(expression.expr, filter=filter_raw))
Expand All @@ -2653,6 +2761,14 @@ def bit_xor(
expression: Argument to perform bitwise calculation on
distinct: If True, evaluate each unique value of expression only once
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [5, 3]})
>>> result = df.aggregate([], [dfn.functions.bit_xor(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
6
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.bit_xor(expression.expr, distinct=distinct, filter=filter_raw))
Expand All @@ -2670,6 +2786,14 @@ def bool_and(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: Argument to perform calculation on
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [True, True, False]})
>>> result = df.aggregate([], [dfn.functions.bool_and(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
False
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.bool_and(expression.expr, filter=filter_raw))
Expand All @@ -2687,6 +2811,14 @@ def bool_or(expression: Expr, filter: Expr | None = None) -> Expr:
Args:
expression: Argument to perform calculation on
filter: If provided, only compute against rows for which the filter is True

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [False, False, True]})
>>> result = df.aggregate([], [dfn.functions.bool_or(dfn.col("a")).alias("v")])
>>> result.collect_column("v")[0].as_py()
True
"""
filter_raw = filter.expr if filter is not None else None
return Expr(f.bool_or(expression.expr, filter=filter_raw))
Expand Down Expand Up @@ -3077,6 +3209,15 @@ def string_agg(
For example::

df.aggregate([], string_agg(col("a"), ",", order_by="b"))

Examples:
---------
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": ["x", "y", "z"]})
>>> result = df.aggregate(
... [], [dfn.functions.string_agg(dfn.col("a"), ",", order_by="a").alias("s")])
>>> result.collect_column("s")[0].as_py()
'x,y,z'
"""
order_by_raw = sort_list_to_raw_sort_list(order_by)
filter_raw = filter.expr if filter is not None else None
Expand Down
Loading