from typing import Union, TYPE_CHECKING, Any, cast, Callable, Tuple
from ..exception import ContributionsAcceptedError
from .types import DataType
if TYPE_CHECKING:
from ._typing import ColumnOrName, LiteralType, DecimalLiteral, DateTimeLiteral
from duckdb import ConstantExpression, ColumnExpression, FunctionExpression, Expression
from duckdb.typing import DuckDBPyType
__all__ = ["Column"]
def _get_expr(x) -> Expression:
return x.expr if isinstance(x, Column) else ConstantExpression(x)
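# Illustrative behaviour of the helper above (a sketch, not part of the public API):
#   _get_expr(Column(ColumnExpression("a")))  -> the wrapped ColumnExpression
#   _get_expr(42)                             -> ConstantExpression(42)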
def _func_op(name: str, doc: str = "") -> Callable[["Column"], "Column"]:
def _(self: "Column") -> "Column":
njc = getattr(self.expr, name)()
return Column(njc)
_.__doc__ = doc
return _
def _unary_op(
name: str,
doc: str = "unary operator",
) -> Callable[["Column"], "Column"]:
"""Create a method for given unary operator"""
def _(self: "Column") -> "Column":
# Call the function identified by 'name' on the internal Expression object
expr = getattr(self.expr, name)()
return Column(expr)
_.__doc__ = doc
return _
def _bin_op(
name: str,
doc: str = "binary operator",
) -> Callable[["Column", Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]], "Column"]:
"""Create a method for given binary operator"""
def _(
self: "Column",
other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"],
) -> "Column":
jc = _get_expr(other)
njc = getattr(self.expr, name)(jc)
return Column(njc)
_.__doc__ = doc
return _
def _bin_func(
name: str,
doc: str = "binary function",
) -> Callable[["Column", Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]], "Column"]:
"""Create a function expression for the given binary function"""
def _(
self: "Column",
other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"],
) -> "Column":
other = _get_expr(other)
func = FunctionExpression(name, self.expr, other)
return Column(func)
_.__doc__ = doc
return _
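# How the factories above are consumed (an illustrative sketch; the actual
# bindings live in the Column class below):
#   __add__  = _bin_op("__add__")      # -> Column(self.expr.__add__(other_expr))
#   contains = _bin_func("contains")   # -> Column(FunctionExpression("contains", self.expr, other_expr))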
class Column:
"""
A column in a DataFrame.
:class:`Column` instances can be created by::
# 1. Select a column out of a DataFrame
df.colName
df["colName"]
# 2. Create from an expression
df.colName + 1
1 / df.colName
.. versionadded:: 1.3.0
"""
def __init__(self, expr: Expression):
self.expr = expr
# arithmetic operators
def __neg__(self):
return Column(-self.expr)
# `and`, `or`, `not` cannot be overloaded in Python,
# so use bitwise operators as boolean operators
__and__ = _bin_op("__and__")
__or__ = _bin_op("__or__")
__invert__ = _func_op("__invert__")
__rand__ = _bin_op("__rand__")
__ror__ = _bin_op("__ror__")
__add__ = _bin_op("__add__")
__sub__ = _bin_op("__sub__")
__mul__ = _bin_op("__mul__")
__div__ = _bin_op("__div__")
__truediv__ = _bin_op("__truediv__")
__mod__ = _bin_op("__mod__")
__pow__ = _bin_op("__pow__")
__radd__ = _bin_op("__radd__")
__rsub__ = _bin_op("__rsub__")
__rmul__ = _bin_op("__rmul__")
__rdiv__ = _bin_op("__rdiv__")
__rtruediv__ = _bin_op("__rtruediv__")
__rmod__ = _bin_op("__rmod__")
__rpow__ = _bin_op("__rpow__")
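    # Example (illustrative): with an integer column ``df.age``, ``df.age + 1``
    # resolves to ``__add__`` above, wrapping the literal via
    # ``_get_expr`` as ``ConstantExpression(1)``, while ``1 + df.age`` takes the
    # reflected ``__radd__`` path.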
def __getitem__(self, k: Any) -> "Column":
"""
An expression that gets an item at position ``ordinal`` out of a list,
or gets an item by key out of a dict.
.. versionadded:: 1.3.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
k
a literal value, or a slice object without step.
Returns
-------
:class:`Column`
Column representing the item got by key out of a dict, or substrings sliced by
the given slice object.
Examples
--------
        >>> df = spark.createDataFrame([('abcdefg', {"key": "value"})], ["l", "d"])
>>> df.select(df.l[slice(1, 3)], df.d['key']).show()
+------------------+------+
|substring(l, 1, 3)|d[key]|
+------------------+------+
| abc| value|
+------------------+------+
"""
if isinstance(k, slice):
raise ContributionsAcceptedError
# if k.step is not None:
# raise ValueError("Using a slice with a step value is not supported")
# return self.substr(k.start, k.stop)
else:
# FIXME: this is super hacky
expr_str = str(self.expr) + "." + str(k)
return Column(ColumnExpression(expr_str))
def __getattr__(self, item: Any) -> "Column":
"""
        An expression that gets an item by key out of a dict, accessed as an
        attribute; equivalent to ``self[item]``.
Parameters
----------
item
a literal value.
Returns
-------
:class:`Column`
Column representing the item got by key out of a dict.
Examples
--------
        >>> df = spark.createDataFrame([('abcdefg', {"key": "value"})], ["l", "d"])
>>> df.select(df.d.key).show()
+------+
|d[key]|
+------+
| value|
+------+
"""
if item.startswith("__"):
raise AttributeError("Can not access __ (dunder) method")
return self[item]
def alias(self, alias: str):
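        """
        Returns this :class:`Column` renamed with the given alias.

        Examples
        --------
        A minimal sketch mirroring the PySpark API (assumes a ``spark``
        session, as in the other doctests in this module):

        >>> df = spark.createDataFrame([(2, "Alice")], ["age", "name"])
        >>> df.select(df.age.alias("age2")).collect()
        [Row(age2=2)]
        """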
return Column(self.expr.alias(alias))
def when(self, condition: "Column", value: Any):
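        """
        Evaluates a list of conditions and returns one of multiple possible
        result expressions; used to chain additional conditions onto an
        existing case expression. See also :meth:`otherwise`.

        Examples
        --------
        A hedged sketch mirroring the PySpark API (assumes the sibling
        ``functions`` module exposes ``when``, as PySpark's does):

        >>> from duckdb.experimental.spark.sql import functions as F
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(F.when(df.age == 2, 3).when(df.age == 5, 4).alias("age")).collect()
        [Row(age=3), Row(age=4)]
        """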
if not isinstance(condition, Column):
raise TypeError("condition should be a Column")
v = _get_expr(value)
expr = self.expr.when(condition.expr, v)
return Column(expr)
def otherwise(self, value: Any):
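        """
        Evaluates a list of conditions and returns a default result expression
        for rows where no :meth:`when` condition matched.

        Examples
        --------
        A hedged sketch mirroring the PySpark API (same assumptions as
        :meth:`when` above):

        >>> from duckdb.experimental.spark.sql import functions as F
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(F.when(df.age == 2, df.age + 1).otherwise(df.age - 1).alias("age")).collect()
        [Row(age=3), Row(age=4)]
        """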
v = _get_expr(value)
expr = self.expr.otherwise(v)
return Column(expr)
def cast(self, dataType: Union[DataType, str]) -> "Column":
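        """
        Casts the column into the given type. Accepts either a
        :class:`DataType` or a type string parsed by ``DuckDBPyType``
        (e.g. ``"VARCHAR"``; DuckDB also accepts ``"string"`` as an alias).

        Examples
        --------
        A minimal, illustrative sketch:

        >>> df = spark.createDataFrame([(2, "Alice")], ["age", "name"])
        >>> df.select(df.age.cast("string").alias("ages")).collect()
        [Row(ages='2')]
        """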
if isinstance(dataType, str):
# Try to construct a default DuckDBPyType from it
internal_type = DuckDBPyType(dataType)
else:
internal_type = dataType.duckdb_type
return Column(self.expr.cast(internal_type))
def isin(self, *cols: Any) -> "Column":
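        """
        A boolean expression that is evaluated to true if the value of this
        expression is contained in the evaluated values of the arguments.
        Accepts either varargs or a single list/set.

        Examples
        --------
        A hedged sketch mirroring the PySpark API:

        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.isin("Bob", "Mike")).collect()
        [Row(age=5, name='Bob')]
        """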
if len(cols) == 1 and isinstance(cols[0], (list, set)):
# Only one argument supplied, it's a list
cols = cast(Tuple, cols[0])
cols = cast(
Tuple,
[_get_expr(c) for c in cols],
)
return Column(self.expr.isin(*cols))
    # comparison operators
def __eq__( # type: ignore[override]
self,
other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"],
) -> "Column":
"""binary function"""
return Column(self.expr == (_get_expr(other)))
def __ne__( # type: ignore[override]
self,
other: Any,
) -> "Column":
"""binary function"""
return Column(self.expr != (_get_expr(other)))
__lt__ = _bin_op("__lt__")
__le__ = _bin_op("__le__")
__ge__ = _bin_op("__ge__")
__gt__ = _bin_op("__gt__")
# String interrogation methods
contains = _bin_func("contains")
rlike = _bin_func("regexp_matches")
like = _bin_func("~~")
ilike = _bin_func("~~*")
startswith = _bin_func("starts_with")
endswith = _bin_func("suffix")
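    # Example (illustrative): ``df.name.contains("Al")`` builds
    # ``FunctionExpression("contains", df.name.expr, ConstantExpression("Al"))``;
    # ``like``/``ilike`` map onto DuckDB's ``~~``/``~~*`` pattern-matching operators.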
# order
_asc_doc = """
Returns a sort expression based on the ascending order of the column.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc()).collect()
[Row(name='Alice'), Row(name='Tom')]
"""
_asc_nulls_first_doc = """
    Returns a sort expression based on ascending order of the column, and null values
    appear before non-null values.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect()
[Row(name=None), Row(name='Alice'), Row(name='Tom')]
"""
_asc_nulls_last_doc = """
Returns a sort expression based on ascending order of the column, and null values
appear after non-null values.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect()
[Row(name='Alice'), Row(name='Tom'), Row(name=None)]
"""
_desc_doc = """
Returns a sort expression based on the descending order of the column.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc()).collect()
[Row(name='Tom'), Row(name='Alice')]
"""
_desc_nulls_first_doc = """
Returns a sort expression based on the descending order of the column, and null values
appear before non-null values.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect()
[Row(name=None), Row(name='Tom'), Row(name='Alice')]
"""
_desc_nulls_last_doc = """
Returns a sort expression based on the descending order of the column, and null values
appear after non-null values.
Examples
--------
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
>>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect()
[Row(name='Tom'), Row(name='Alice'), Row(name=None)]
"""
asc = _unary_op("asc", _asc_doc)
desc = _unary_op("desc", _desc_doc)
nulls_first = _unary_op("null_first")
nulls_last = _unary_op("null_last")
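    # Example (illustrative): ``df.name.asc().nulls_last()`` corresponds to
    # ``name ASC NULLS LAST`` in SQL ordering terms.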
    def asc_nulls_first(self) -> "Column":
        return self.asc().nulls_first()

    asc_nulls_first.__doc__ = _asc_nulls_first_doc

    def asc_nulls_last(self) -> "Column":
        return self.asc().nulls_last()

    asc_nulls_last.__doc__ = _asc_nulls_last_doc

    def desc_nulls_first(self) -> "Column":
        return self.desc().nulls_first()

    desc_nulls_first.__doc__ = _desc_nulls_first_doc

    def desc_nulls_last(self) -> "Column":
        return self.desc().nulls_last()

    desc_nulls_last.__doc__ = _desc_nulls_last_doc