From 8582e49cb1764eadccee89f240c8e3c45388124c Mon Sep 17 00:00:00 2001 From: ntjohnson1 <24689722+ntjohnson1@users.noreply.github.com> Date: Thu, 5 Mar 2026 14:58:20 -0500 Subject: [PATCH 1/6] Add docstring examples for Scalar regex, crypto, struct and other functions Add example usage to docstrings for Scalar regex, crypto, struct and other functions to improve documentation. Co-Authored-By: Claude Opus 4.6 --- python/datafusion/functions.py | 220 ++++++++++++++++++++++++++++++--- 1 file changed, 204 insertions(+), 16 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index fd116254b..0b5bb702b 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -637,7 +637,17 @@ def chr(arg: Expr) -> Expr: def coalesce(*args: Expr) -> Expr: - """Returns the value of the first expr in ``args`` which is not NULL.""" + """Returns the value of the first expr in ``args`` which is not NULL. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [None, 1], "b": [2, 3]}) + >>> result = df.select( + ... dfn.functions.coalesce(dfn.col("a"), dfn.col("b")).alias("c")) + >>> result.collect_column("c")[0].as_py() + 2 + """ args = [arg.expr for arg in args] return Expr(f.coalesce(*args)) @@ -820,7 +830,16 @@ def ltrim(arg: Expr) -> Expr: def md5(arg: Expr) -> Expr: - """Computes an MD5 128-bit checksum for a string expression.""" + """Computes an MD5 128-bit checksum for a string expression. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.md5(dfn.col("a")).alias("md5")) + >>> result.collect_column("md5")[0].as_py() + '5d41402abc4b2a76b9719d911017c592' + """ return Expr(f.md5(arg.expr)) @@ -830,7 +849,18 @@ def nanvl(x: Expr, y: Expr) -> Expr: def nvl(x: Expr, y: Expr) -> Expr: - """Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``.""" + """Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [None, 1], "b": [0, 0]}) + >>> nvl_df = df.select(dfn.functions.nvl(dfn.col("a"), dfn.col("b")).alias("nvl")) + >>> nvl_df.collect_column("nvl")[0].as_py() + 0 + >>> nvl_df.collect_column("nvl")[1].as_py() + 1 + """ return Expr(f.nvl(x.expr, y.expr)) @@ -899,10 +929,22 @@ def radians(arg: Expr) -> Expr: def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: - """Find if any regular expression (regex) matches exist. + r"""Find if any regular expression (regex) matches exist. Tests a string using a regular expression returning true if at least one match, false otherwise. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello123"]}) + >>> result = df.select( + ... dfn.functions.regexp_like( + ... dfn.col("a"), dfn.lit("\\d+") + ... ).alias("m") + ... ) + >>> result.collect_column("m")[0].as_py() + True """ if flags is not None: flags = flags.expr @@ -910,10 +952,22 @@ def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: - """Perform regular expression (regex) matching. + r"""Perform regular expression (regex) matching. Returns an array with each element containing the leftmost-first match of the corresponding index in ``regex`` to string in ``string``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello 42 world"]}) + >>> result = df.select( + ... dfn.functions.regexp_match( + ... dfn.col("a"), dfn.lit("(\\d+)") + ... ).alias("m") + ... ) + >>> result.collect_column("m")[0].as_py() + ['42'] """ if flags is not None: flags = flags.expr @@ -923,13 +977,26 @@ def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: def regexp_replace( string: Expr, pattern: Expr, replacement: Expr, flags: Expr | None = None ) -> Expr: - """Replaces substring(s) matching a PCRE-like regular expression. + r"""Replaces substring(s) matching a PCRE-like regular expression. The full list of supported features and syntax can be found at Supported flags with the addition of 'g' can be found at + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello 42"]}) + >>> result = df.select( + ... dfn.functions.regexp_replace( + ... dfn.col("a"), dfn.lit("\\d+"), + ... dfn.lit("XX") + ... ).alias("r") + ... ) + >>> result.collect_column("r")[0].as_py() + 'hello XX' """ if flags is not None: flags = flags.expr @@ -943,6 +1010,15 @@ def regexp_count( Optional start position (the first position is 1) to search for the regular expression. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["abcabc"]}) + >>> result = df.select( + ... dfn.functions.regexp_count(dfn.col("a"), dfn.lit("abc")).alias("c")) + >>> result.collect_column("c")[0].as_py() + 2 """ if flags is not None: flags = flags.expr @@ -958,12 +1034,24 @@ def regexp_instr( flags: Expr | None = None, sub_expr: Expr | None = None, ) -> Expr: - """Returns the position of a regular expression match in a string. + r"""Returns the position of a regular expression match in a string. Searches ``values`` for the ``n``-th occurrence of ``regex``, starting at position ``start`` (the first position is 1). Returns the starting or ending position based on ``end_position``. Use ``flags`` to control regex behavior and ``sub_expr`` to return the position of a specific capture group instead of the entire match. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello 42 world"]}) + >>> result = df.select( + ... dfn.functions.regexp_instr( + ... dfn.col("a"), dfn.lit("\\d+") + ... ).alias("pos") + ... ) + >>> result.collect_column("pos")[0].as_py() + 7 """ start = start.expr if start is not None else None n = n.expr if n is not None else None @@ -1030,22 +1118,66 @@ def rtrim(arg: Expr) -> Expr: def sha224(arg: Expr) -> Expr: - """Computes the SHA-224 hash of a binary string.""" + """Computes the SHA-224 hash of a binary string. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha224(dfn.col("a")).alias("h") + ... ) + >>> len(result.collect_column("h")[0].as_py()) > 0 + True + """ return Expr(f.sha224(arg.expr)) def sha256(arg: Expr) -> Expr: - """Computes the SHA-256 hash of a binary string.""" + """Computes the SHA-256 hash of a binary string. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha256(dfn.col("a")).alias("h") + ... ) + >>> len(result.collect_column("h")[0].as_py()) > 0 + True + """ return Expr(f.sha256(arg.expr)) def sha384(arg: Expr) -> Expr: - """Computes the SHA-384 hash of a binary string.""" + """Computes the SHA-384 hash of a binary string. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha384(dfn.col("a")).alias("h") + ... ) + >>> len(result.collect_column("h")[0].as_py()) > 0 + True + """ return Expr(f.sha384(arg.expr)) def sha512(arg: Expr) -> Expr: - """Computes the SHA-512 hash of a binary string.""" + """Computes the SHA-512 hash of a binary string. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha512(dfn.col("a")).alias("h") + ... ) + >>> len(result.collect_column("h")[0].as_py()) > 0 + True + """ return Expr(f.sha512(arg.expr)) @@ -1370,18 +1502,55 @@ def range(start: Expr, stop: Expr, step: Expr) -> Expr: def uuid() -> Expr: - """Returns uuid v4 as a string value.""" + """Returns uuid v4 as a string value. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.uuid().alias("u") + ... ) + >>> len(result.collect_column("u")[0].as_py()) == 36 + True + """ return Expr(f.uuid()) def struct(*args: Expr) -> Expr: - """Returns a struct with the given arguments.""" + """Returns a struct with the given arguments. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1], "b": [2]}) + >>> result = df.select( + ... dfn.functions.struct( + ... dfn.col("a"), dfn.col("b") + ... ).alias("s") + ... ) + >>> result.collect_column("s")[0].as_py() == {"c0": 1, "c1": 2} + True + """ args = [arg.expr for arg in args] return Expr(f.struct(*args)) def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr: - """Returns a struct with the given names and arguments pairs.""" + """Returns a struct with the given names and arguments pairs. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.named_struct( + ... [("x", dfn.lit(10)), ("y", dfn.lit(20))] + ... ).alias("s") + ... ) + >>> result.collect_column("s")[0].as_py() == {"x": 10, "y": 20} + True + """ name_pair_exprs = [ [Expr.literal(pa.scalar(pair[0], type=pa.string())), pair[1]] for pair in name_pairs @@ -1398,12 +1567,31 @@ def from_unixtime(arg: Expr) -> Expr: def arrow_typeof(arg: Expr) -> Expr: - """Returns the Arrow type of the expression.""" + """Returns the Arrow type of the expression. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select(dfn.functions.arrow_typeof(dfn.col("a")).alias("t")) + >>> result.collect_column("t")[0].as_py() + 'Int64' + """ return Expr(f.arrow_typeof(arg.expr)) def arrow_cast(expr: Expr, data_type: Expr) -> Expr: - """Casts an expression to a specified data type.""" + """Casts an expression to a specified data type. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> data_type = dfn.string_literal("Float64")).alias("c") + >>> result = df.select(dfn.functions.arrow_cast(dfn.col("a"), data_type).alias("c")) + >>> result.collect_column("c")[0].as_py() + 1.0 + """ return Expr(f.arrow_cast(expr.expr, data_type.expr)) From 7836164d05345b7cd43b90cedec750d23022df90 Mon Sep 17 00:00:00 2001 From: ntjohnson1 <24689722+ntjohnson1@users.noreply.github.com> Date: Thu, 12 Mar 2026 04:43:14 -0400 Subject: [PATCH 2/6] Fix typo --- python/datafusion/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 0b5bb702b..a29e20b86 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1587,7 +1587,7 @@ def arrow_cast(expr: Expr, data_type: Expr) -> Expr: --------- >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1]}) - >>> data_type = dfn.string_literal("Float64")).alias("c") + >>> data_type = dfn.string_literal("Float64") >>> result = df.select(dfn.functions.arrow_cast(dfn.col("a"), data_type).alias("c")) >>> result.collect_column("c")[0].as_py() 1.0 From caed23639f7bb3c5e0503cb49dbb2c53e9a17225 Mon Sep 17 00:00:00 2001 From: ntjohnson1 <24689722+ntjohnson1@users.noreply.github.com> Date: Sat, 14 Mar 2026 10:46:42 -0400 Subject: [PATCH 3/6] Fix docstring already broken that I added an example to --- python/datafusion/functions.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index a29e20b86..30391c1e0 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1036,10 +1036,13 @@ def regexp_instr( ) -> Expr: r"""Returns the position of a regular expression match in a string. - Searches ``values`` for the ``n``-th occurrence of ``regex``, starting at position - ``start`` (the first position is 1). Returns the starting or ending position based - on ``end_position``. Use ``flags`` to control regex behavior and ``sub_expr`` to - return the position of a specific capture group instead of the entire match. + Args: + values: Data to search for the regular expression match. + regex: Regular expression to search for. + start: Optional position to start the search (the first position is 1). + n: Optional occurrence of the match to find (the first occurrence is 1). + flags: Optional regular expression flags to control regex behavior. + sub_expr: Optionally capture group position instead of the entire match. Examples: --------- From c942bf30ef5f34b6ecdcf96437688d28081ef15d Mon Sep 17 00:00:00 2001 From: ntjohnson1 <24689722+ntjohnson1@users.noreply.github.com> Date: Sat, 14 Mar 2026 10:55:14 -0400 Subject: [PATCH 4/6] Add sha outputs --- python/datafusion/functions.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 30391c1e0..4184158d4 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1130,8 +1130,8 @@ def sha224(arg: Expr) -> Expr: >>> result = df.select( ... dfn.functions.sha224(dfn.col("a")).alias("h") ... ) - >>> len(result.collect_column("h")[0].as_py()) > 0 - True + >>> result.collect_column("h")[0].as_py().hex() + 'ea09ae9cc6768c50fcee903ed054556e5bfc8347907f12598aa24193' """ return Expr(f.sha224(arg.expr)) @@ -1146,8 +1146,8 @@ def sha256(arg: Expr) -> Expr: >>> result = df.select( ... dfn.functions.sha256(dfn.col("a")).alias("h") ... ) - >>> len(result.collect_column("h")[0].as_py()) > 0 - True + >>> result.collect_column("h")[0].as_py().hex() + '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824' """ return Expr(f.sha256(arg.expr)) @@ -1162,8 +1162,8 @@ def sha384(arg: Expr) -> Expr: >>> result = df.select( ... dfn.functions.sha384(dfn.col("a")).alias("h") ... ) - >>> len(result.collect_column("h")[0].as_py()) > 0 - True + >>> result.collect_column("h")[0].as_py().hex() + '59e1748777448c69de6b800d7a33bbfb9ff1b... """ return Expr(f.sha384(arg.expr)) @@ -1178,8 +1178,8 @@ def sha512(arg: Expr) -> Expr: >>> result = df.select( ... dfn.functions.sha512(dfn.col("a")).alias("h") ... ) - >>> len(result.collect_column("h")[0].as_py()) > 0 - True + >>> result.collect_column("h")[0].as_py().hex() + '9b71d224bd62f3785d96d46ad3ea3d73319bfb... """ return Expr(f.sha512(arg.expr)) From 224a19f0d66810cba41d0c532314db6f5f4d8426 Mon Sep 17 00:00:00 2001 From: ntjohnson1 <24689722+ntjohnson1@users.noreply.github.com> Date: Sat, 14 Mar 2026 10:58:08 -0400 Subject: [PATCH 5/6] clarify struct results --- python/datafusion/functions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 4184158d4..f49ddfb3b 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -1532,6 +1532,10 @@ def struct(*args: Expr) -> Expr: ... dfn.col("a"), dfn.col("b") ... ).alias("s") ... ) + + Children in the new struct will always be `c0`, ..., `cN-1` + for `N` children. + >>> result.collect_column("s")[0].as_py() == {"c0": 1, "c1": 2} True """ From dd1fa5fe385983d9c0ce1c855a398201b1ea1cde Mon Sep 17 00:00:00 2001 From: ntjohnson1 <24689722+ntjohnson1@users.noreply.github.com> Date: Sat, 14 Mar 2026 11:02:56 -0400 Subject: [PATCH 6/6] Examples should follow google docstyle --- python/datafusion/functions.py | 479 ++++++++++++++++----------------- 1 file changed, 225 insertions(+), 254 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index f49ddfb3b..73df56643 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -496,12 +496,11 @@ def acos(arg: Expr) -> Expr: """Returns the arc cosine or inverse cosine of a number. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [1.0]}) - >>> result = df.select(dfn.functions.acos(dfn.col("a")).alias("acos")) - >>> result.collect_column("acos")[0].as_py() - 0.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0]}) + >>> result = df.select(dfn.functions.acos(dfn.col("a")).alias("acos")) + >>> result.collect_column("acos")[0].as_py() + 0.0 """ return Expr(f.acos(arg.expr)) @@ -510,12 +509,11 @@ def acosh(arg: Expr) -> Expr: """Returns inverse hyperbolic cosine. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [1.0]}) - >>> result = df.select(dfn.functions.acosh(dfn.col("a")).alias("acosh")) - >>> result.collect_column("acosh")[0].as_py() - 0.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0]}) + >>> result = df.select(dfn.functions.acosh(dfn.col("a")).alias("acosh")) + >>> result.collect_column("acosh")[0].as_py() + 0.0 """ return Expr(f.acosh(arg.expr)) @@ -529,12 +527,11 @@ def asin(arg: Expr) -> Expr: """Returns the arc sine or inverse sine of a number. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [0.0]}) - >>> result = df.select(dfn.functions.asin(dfn.col("a")).alias("asin")) - >>> result.collect_column("asin")[0].as_py() - 0.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.asin(dfn.col("a")).alias("asin")) + >>> result.collect_column("asin")[0].as_py() + 0.0 """ return Expr(f.asin(arg.expr)) @@ -543,12 +540,11 @@ def asinh(arg: Expr) -> Expr: """Returns inverse hyperbolic sine. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [0.0]}) - >>> result = df.select(dfn.functions.asinh(dfn.col("a")).alias("asinh")) - >>> result.collect_column("asinh")[0].as_py() - 0.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.asinh(dfn.col("a")).alias("asinh")) + >>> result.collect_column("asinh")[0].as_py() + 0.0 """ return Expr(f.asinh(arg.expr)) @@ -557,12 +553,11 @@ def atan(arg: Expr) -> Expr: """Returns inverse tangent of a number. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [0.0]}) - >>> result = df.select(dfn.functions.atan(dfn.col("a")).alias("atan")) - >>> result.collect_column("atan")[0].as_py() - 0.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.atan(dfn.col("a")).alias("atan")) + >>> result.collect_column("atan")[0].as_py() + 0.0 """ return Expr(f.atan(arg.expr)) @@ -571,12 +566,11 @@ def atanh(arg: Expr) -> Expr: """Returns inverse hyperbolic tangent. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [0.0]}) - >>> result = df.select(dfn.functions.atanh(dfn.col("a")).alias("atanh")) - >>> result.collect_column("atanh")[0].as_py() - 0.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.atanh(dfn.col("a")).alias("atanh")) + >>> result.collect_column("atanh")[0].as_py() + 0.0 """ return Expr(f.atanh(arg.expr)) @@ -585,13 +579,12 @@ def atan2(y: Expr, x: Expr) -> Expr: """Returns inverse tangent of a division given in the argument. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"y": [0.0], "x": [1.0]}) - >>> result = df.select( - ... dfn.functions.atan2(dfn.col("y"), dfn.col("x")).alias("atan2")) - >>> result.collect_column("atan2")[0].as_py() - 0.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [0.0], "x": [1.0]}) + >>> result = df.select( + ... dfn.functions.atan2(dfn.col("y"), dfn.col("x")).alias("atan2")) + >>> result.collect_column("atan2")[0].as_py() + 0.0 """ return Expr(f.atan2(y.expr, x.expr)) @@ -640,13 +633,12 @@ def coalesce(*args: Expr) -> Expr: """Returns the value of the first expr in ``args`` which is not NULL. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [None, 1], "b": [2, 3]}) - >>> result = df.select( - ... dfn.functions.coalesce(dfn.col("a"), dfn.col("b")).alias("c")) - >>> result.collect_column("c")[0].as_py() - 2 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [None, 1], "b": [2, 3]}) + >>> result = df.select( + ... dfn.functions.coalesce(dfn.col("a"), dfn.col("b")).alias("c")) + >>> result.collect_column("c")[0].as_py() + 2 """ args = [arg.expr for arg in args] return Expr(f.coalesce(*args)) @@ -656,12 +648,11 @@ def cos(arg: Expr) -> Expr: """Returns the cosine of the argument. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [0,-1,1]}) - >>> cos_df = df.select(dfn.functions.cos(dfn.col("a")).alias("cos")) - >>> cos_df.collect_column("cos")[0].as_py() - 1.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0,-1,1]}) + >>> cos_df = df.select(dfn.functions.cos(dfn.col("a")).alias("cos")) + >>> cos_df.collect_column("cos")[0].as_py() + 1.0 """ return Expr(f.cos(arg.expr)) @@ -670,12 +661,11 @@ def cosh(arg: Expr) -> Expr: """Returns the hyperbolic cosine of the argument. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [0,-1,1]}) - >>> cosh_df = df.select(dfn.functions.cosh(dfn.col("a")).alias("cosh")) - >>> cosh_df.collect_column("cosh")[0].as_py() - 1.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0,-1,1]}) + >>> cosh_df = df.select(dfn.functions.cosh(dfn.col("a")).alias("cosh")) + >>> cosh_df.collect_column("cosh")[0].as_py() + 1.0 """ return Expr(f.cosh(arg.expr)) @@ -684,18 +674,17 @@ def cot(arg: Expr) -> Expr: """Returns the cotangent of the argument. Examples: - --------- - >>> from math import pi - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [pi / 4]}) - >>> import builtins - >>> result = df.select( - ... dfn.functions.cot(dfn.col("a")).alias("cot") - ... ) - >>> builtins.round( - ... result.collect_column("cot")[0].as_py(), 1 - ... ) - 1.0 + >>> from math import pi + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [pi / 4]}) + >>> import builtins + >>> result = df.select( + ... dfn.functions.cot(dfn.col("a")).alias("cot") + ... ) + >>> builtins.round( + ... result.collect_column("cot")[0].as_py(), 1 + ... ) + 1.0 """ return Expr(f.cot(arg.expr)) @@ -704,13 +693,12 @@ def degrees(arg: Expr) -> Expr: """Converts the argument from radians to degrees. Examples: - --------- - >>> from math import pi - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [0,pi,2*pi]}) - >>> deg_df = df.select(dfn.functions.degrees(dfn.col("a")).alias("deg")) - >>> deg_df.collect_column("deg")[2].as_py() - 360.0 + >>> from math import pi + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0,pi,2*pi]}) + >>> deg_df = df.select(dfn.functions.degrees(dfn.col("a")).alias("deg")) + >>> deg_df.collect_column("deg")[2].as_py() + 360.0 """ return Expr(f.degrees(arg.expr)) @@ -833,12 +821,11 @@ def md5(arg: Expr) -> Expr: """Computes an MD5 128-bit checksum for a string expression. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": ["hello"]}) - >>> result = df.select(dfn.functions.md5(dfn.col("a")).alias("md5")) - >>> result.collect_column("md5")[0].as_py() - '5d41402abc4b2a76b9719d911017c592' + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.md5(dfn.col("a")).alias("md5")) + >>> result.collect_column("md5")[0].as_py() + '5d41402abc4b2a76b9719d911017c592' """ return Expr(f.md5(arg.expr)) @@ -852,14 +839,15 @@ def nvl(x: Expr, y: Expr) -> Expr: """Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [None, 1], "b": [0, 0]}) - >>> nvl_df = df.select(dfn.functions.nvl(dfn.col("a"), dfn.col("b")).alias("nvl")) - >>> nvl_df.collect_column("nvl")[0].as_py() - 0 - >>> nvl_df.collect_column("nvl")[1].as_py() - 1 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [None, 1], "b": [0, 0]}) + >>> nvl_df = df.select( + ... dfn.functions.nvl(dfn.col("a"), dfn.col("b")).alias("nvl") + ... ) + >>> nvl_df.collect_column("nvl")[0].as_py() + 0 + >>> nvl_df.collect_column("nvl")[1].as_py() + 1 """ return Expr(f.nvl(x.expr, y.expr)) @@ -912,18 +900,17 @@ def radians(arg: Expr) -> Expr: """Converts the argument from degrees to radians. Examples: - --------- - >>> from math import pi - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [180.0]}) - >>> import builtins - >>> result = df.select( - ... dfn.functions.radians(dfn.col("a")).alias("rad") - ... ) - >>> builtins.round( - ... result.collect_column("rad")[0].as_py(), 6 - ... ) - 3.141593 + >>> from math import pi + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [180.0]}) + >>> import builtins + >>> result = df.select( + ... dfn.functions.radians(dfn.col("a")).alias("rad") + ... ) + >>> builtins.round( + ... result.collect_column("rad")[0].as_py(), 6 + ... ) + 3.141593 """ return Expr(f.radians(arg.expr)) @@ -935,16 +922,15 @@ def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: false otherwise. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": ["hello123"]}) - >>> result = df.select( - ... dfn.functions.regexp_like( - ... dfn.col("a"), dfn.lit("\\d+") - ... ).alias("m") - ... ) - >>> result.collect_column("m")[0].as_py() - True + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello123"]}) + >>> result = df.select( + ... dfn.functions.regexp_like( + ... dfn.col("a"), dfn.lit("\\d+") + ... ).alias("m") + ... ) + >>> result.collect_column("m")[0].as_py() + True """ if flags is not None: flags = flags.expr @@ -958,16 +944,15 @@ def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: corresponding index in ``regex`` to string in ``string``. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": ["hello 42 world"]}) - >>> result = df.select( - ... dfn.functions.regexp_match( - ... dfn.col("a"), dfn.lit("(\\d+)") - ... ).alias("m") - ... ) - >>> result.collect_column("m")[0].as_py() - ['42'] + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello 42 world"]}) + >>> result = df.select( + ... dfn.functions.regexp_match( + ... dfn.col("a"), dfn.lit("(\\d+)") + ... ).alias("m") + ... ) + >>> result.collect_column("m")[0].as_py() + ['42'] """ if flags is not None: flags = flags.expr @@ -986,17 +971,16 @@ def regexp_replace( Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": ["hello 42"]}) - >>> result = df.select( - ... dfn.functions.regexp_replace( - ... dfn.col("a"), dfn.lit("\\d+"), - ... dfn.lit("XX") - ... ).alias("r") - ... ) - >>> result.collect_column("r")[0].as_py() - 'hello XX' + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello 42"]}) + >>> result = df.select( + ... dfn.functions.regexp_replace( + ... dfn.col("a"), dfn.lit("\\d+"), + ... dfn.lit("XX") + ... ).alias("r") + ... ) + >>> result.collect_column("r")[0].as_py() + 'hello XX' """ if flags is not None: flags = flags.expr @@ -1012,13 +996,12 @@ def regexp_count( expression. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": ["abcabc"]}) - >>> result = df.select( - ... dfn.functions.regexp_count(dfn.col("a"), dfn.lit("abc")).alias("c")) - >>> result.collect_column("c")[0].as_py() - 2 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["abcabc"]}) + >>> result = df.select( + ... dfn.functions.regexp_count(dfn.col("a"), dfn.lit("abc")).alias("c")) + >>> result.collect_column("c")[0].as_py() + 2 """ if flags is not None: flags = flags.expr @@ -1045,16 +1028,15 @@ def regexp_instr( sub_expr: Optionally capture group position instead of the entire match. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": ["hello 42 world"]}) - >>> result = df.select( - ... dfn.functions.regexp_instr( - ... dfn.col("a"), dfn.lit("\\d+") - ... ).alias("pos") - ... ) - >>> result.collect_column("pos")[0].as_py() - 7 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello 42 world"]}) + >>> result = df.select( + ... dfn.functions.regexp_instr( + ... dfn.col("a"), dfn.lit("\\d+") + ... ).alias("pos") + ... ) + >>> result.collect_column("pos")[0].as_py() + 7 """ start = start.expr if start is not None else None n = n.expr if n is not None else None @@ -1124,14 +1106,13 @@ def sha224(arg: Expr) -> Expr: """Computes the SHA-224 hash of a binary string. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": ["hello"]}) - >>> result = df.select( - ... dfn.functions.sha224(dfn.col("a")).alias("h") - ... ) - >>> result.collect_column("h")[0].as_py().hex() - 'ea09ae9cc6768c50fcee903ed054556e5bfc8347907f12598aa24193' + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha224(dfn.col("a")).alias("h") + ... ) + >>> result.collect_column("h")[0].as_py().hex() + 'ea09ae9cc6768c50fcee903ed054556e5bfc8347907f12598aa24193' """ return Expr(f.sha224(arg.expr)) @@ -1140,14 +1121,13 @@ def sha256(arg: Expr) -> Expr: """Computes the SHA-256 hash of a binary string. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": ["hello"]}) - >>> result = df.select( - ... dfn.functions.sha256(dfn.col("a")).alias("h") - ... ) - >>> result.collect_column("h")[0].as_py().hex() - '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824' + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha256(dfn.col("a")).alias("h") + ... ) + >>> result.collect_column("h")[0].as_py().hex() + '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824' """ return Expr(f.sha256(arg.expr)) @@ -1156,14 +1136,13 @@ def sha384(arg: Expr) -> Expr: """Computes the SHA-384 hash of a binary string. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": ["hello"]}) - >>> result = df.select( - ... dfn.functions.sha384(dfn.col("a")).alias("h") - ... ) - >>> result.collect_column("h")[0].as_py().hex() - '59e1748777448c69de6b800d7a33bbfb9ff1b... + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha384(dfn.col("a")).alias("h") + ... ) + >>> result.collect_column("h")[0].as_py().hex() + '59e1748777448c69de6b800d7a33bbfb9ff1b... """ return Expr(f.sha384(arg.expr)) @@ -1172,14 +1151,13 @@ def sha512(arg: Expr) -> Expr: """Computes the SHA-512 hash of a binary string. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": ["hello"]}) - >>> result = df.select( - ... dfn.functions.sha512(dfn.col("a")).alias("h") - ... ) - >>> result.collect_column("h")[0].as_py().hex() - '9b71d224bd62f3785d96d46ad3ea3d73319bfb... + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha512(dfn.col("a")).alias("h") + ... ) + >>> result.collect_column("h")[0].as_py().hex() + '9b71d224bd62f3785d96d46ad3ea3d73319bfb... """ return Expr(f.sha512(arg.expr)) @@ -1193,12 +1171,11 @@ def sin(arg: Expr) -> Expr: """Returns the sine of the argument. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [0.0]}) - >>> result = df.select(dfn.functions.sin(dfn.col("a")).alias("sin")) - >>> result.collect_column("sin")[0].as_py() - 0.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.sin(dfn.col("a")).alias("sin")) + >>> result.collect_column("sin")[0].as_py() + 0.0 """ return Expr(f.sin(arg.expr)) @@ -1207,12 +1184,11 @@ def sinh(arg: Expr) -> Expr: """Returns the hyperbolic sine of the argument. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [0.0]}) - >>> result = df.select(dfn.functions.sinh(dfn.col("a")).alias("sinh")) - >>> result.collect_column("sinh")[0].as_py() - 0.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.sinh(dfn.col("a")).alias("sinh")) + >>> result.collect_column("sinh")[0].as_py() + 0.0 """ return Expr(f.sinh(arg.expr)) @@ -1264,12 +1240,11 @@ def tan(arg: Expr) -> Expr: """Returns the tangent of the argument. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [0.0]}) - >>> result = df.select(dfn.functions.tan(dfn.col("a")).alias("tan")) - >>> result.collect_column("tan")[0].as_py() - 0.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.tan(dfn.col("a")).alias("tan")) + >>> result.collect_column("tan")[0].as_py() + 0.0 """ return Expr(f.tan(arg.expr)) @@ -1278,12 +1253,11 @@ def tanh(arg: Expr) -> Expr: """Returns the hyperbolic tangent of the argument. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [0.0]}) - >>> result = df.select(dfn.functions.tanh(dfn.col("a")).alias("tanh")) - >>> result.collect_column("tanh")[0].as_py() - 0.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.tanh(dfn.col("a")).alias("tanh")) + >>> result.collect_column("tanh")[0].as_py() + 0.0 """ return Expr(f.tanh(arg.expr)) @@ -1508,14 +1482,13 @@ def uuid() -> Expr: """Returns uuid v4 as a string value. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [1]}) - >>> result = df.select( - ... dfn.functions.uuid().alias("u") - ... ) - >>> len(result.collect_column("u")[0].as_py()) == 36 - True + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.uuid().alias("u") + ... ) + >>> len(result.collect_column("u")[0].as_py()) == 36 + True """ return Expr(f.uuid()) @@ -1524,20 +1497,19 @@ def struct(*args: Expr) -> Expr: """Returns a struct with the given arguments. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [1], "b": [2]}) - >>> result = df.select( - ... dfn.functions.struct( - ... dfn.col("a"), dfn.col("b") - ... ).alias("s") - ... ) + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1], "b": [2]}) + >>> result = df.select( + ... dfn.functions.struct( + ... dfn.col("a"), dfn.col("b") + ... ).alias("s") + ... ) - Children in the new struct will always be `c0`, ..., `cN-1` - for `N` children. + Children in the new struct will always be `c0`, ..., `cN-1` + for `N` children. - >>> result.collect_column("s")[0].as_py() == {"c0": 1, "c1": 2} - True + >>> result.collect_column("s")[0].as_py() == {"c0": 1, "c1": 2} + True """ args = [arg.expr for arg in args] return Expr(f.struct(*args)) @@ -1547,16 +1519,15 @@ def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr: """Returns a struct with the given names and arguments pairs. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [1]}) - >>> result = df.select( - ... dfn.functions.named_struct( - ... [("x", dfn.lit(10)), ("y", dfn.lit(20))] - ... ).alias("s") - ... ) - >>> result.collect_column("s")[0].as_py() == {"x": 10, "y": 20} - True + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.named_struct( + ... [("x", dfn.lit(10)), ("y", dfn.lit(20))] + ... ).alias("s") + ... ) + >>> result.collect_column("s")[0].as_py() == {"x": 10, "y": 20} + True """ name_pair_exprs = [ [Expr.literal(pa.scalar(pair[0], type=pa.string())), pair[1]] @@ -1577,12 +1548,11 @@ def arrow_typeof(arg: Expr) -> Expr: """Returns the Arrow type of the expression. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [1]}) - >>> result = df.select(dfn.functions.arrow_typeof(dfn.col("a")).alias("t")) - >>> result.collect_column("t")[0].as_py() - 'Int64' + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select(dfn.functions.arrow_typeof(dfn.col("a")).alias("t")) + >>> result.collect_column("t")[0].as_py() + 'Int64' """ return Expr(f.arrow_typeof(arg.expr)) @@ -1591,13 +1561,14 @@ def arrow_cast(expr: Expr, data_type: Expr) -> Expr: """Casts an expression to a specified data type. Examples: - --------- - >>> ctx = dfn.SessionContext() - >>> df = ctx.from_pydict({"a": [1]}) - >>> data_type = dfn.string_literal("Float64") - >>> result = df.select(dfn.functions.arrow_cast(dfn.col("a"), data_type).alias("c")) - >>> result.collect_column("c")[0].as_py() - 1.0 + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> data_type = dfn.string_literal("Float64") + >>> result = df.select( + ... dfn.functions.arrow_cast(dfn.col("a"), data_type).alias("c") + ... ) + >>> result.collect_column("c")[0].as_py() + 1.0 """ return Expr(f.arrow_cast(expr.expr, data_type.expr))