from typing import TYPE_CHECKING, List, Optional, Union, cast

from ..exception import ContributionsAcceptedError
from .types import StructType
from ..errors import PySparkNotImplementedError, PySparkTypeError

PrimitiveType = Union[bool, float, int, str]
OptionalPrimitiveType = Optional[PrimitiveType]

if TYPE_CHECKING:
    from duckdb.experimental.spark.sql.dataframe import DataFrame
    from duckdb.experimental.spark.sql.session import SparkSession


class DataFrameWriter:
    def __init__(self, dataframe: "DataFrame"):
        self.dataframe = dataframe

    def saveAsTable(self, table_name: str) -> None:
        # Materialize the underlying DuckDB relation as a table in the
        # connected database.
        relation = self.dataframe.relation
        relation.create(table_name)
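    # Usage sketch (illustrative addition, not part of the upstream module);
    # assumes a SparkSession `spark` created via this experimental API:
    #
    #     df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])
    #     df.write.saveAsTable("items")
    #     spark.sql("select * from items").show()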

    def parquet(
        self,
        path: str,
        mode: Optional[str] = None,
        partitionBy: Union[str, List[str], None] = None,
        compression: Optional[str] = None,
    ) -> None:
        relation = self.dataframe.relation
        # 'mode' and 'partitionBy' are accepted for PySpark API compatibility
        # but are not implemented yet.
        if mode:
            raise NotImplementedError
        if partitionBy:
            raise NotImplementedError
        relation.write_parquet(path, compression=compression)
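    # Usage sketch (illustrative; the file name is hypothetical). Only
    # 'compression' is forwarded; "zstd" is one of DuckDB's supported
    # Parquet codecs:
    #
    #     df.write.parquet("out.parquet", compression="zstd")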

    def csv(
        self,
        path: str,
        mode: Optional[str] = None,
        compression: Optional[str] = None,
        sep: Optional[str] = None,
        quote: Optional[str] = None,
        escape: Optional[str] = None,
        header: Optional[Union[bool, str]] = None,
        nullValue: Optional[str] = None,
        escapeQuotes: Optional[Union[bool, str]] = None,
        quoteAll: Optional[Union[bool, str]] = None,
        dateFormat: Optional[str] = None,
        timestampFormat: Optional[str] = None,
        ignoreLeadingWhiteSpace: Optional[Union[bool, str]] = None,
        ignoreTrailingWhiteSpace: Optional[Union[bool, str]] = None,
        charToEscapeQuoteEscaping: Optional[str] = None,
        encoding: Optional[str] = None,
        emptyValue: Optional[str] = None,
        lineSep: Optional[str] = None,
    ):
        if mode not in (None, "overwrite"):
            raise NotImplementedError
        # The options below are accepted for PySpark API compatibility but
        # are not implemented yet.
        if escapeQuotes:
            raise NotImplementedError
        if ignoreLeadingWhiteSpace:
            raise NotImplementedError
        if ignoreTrailingWhiteSpace:
            raise NotImplementedError
        if charToEscapeQuoteEscaping:
            raise NotImplementedError
        if emptyValue:
            raise NotImplementedError
        if lineSep:
            raise NotImplementedError
        relation = self.dataframe.relation
        relation.write_csv(
            path,
            sep=sep,
            na_rep=nullValue,
            quotechar=quote,
            compression=compression,
            escapechar=escape,
            # Spark accepts "true"/"false" strings for boolean options, so
            # coerce case-insensitively rather than comparing against "True".
            header=header if isinstance(header, bool) else str(header).lower() == "true",
            encoding=encoding,
            quoting=quoteAll,
            date_format=dateFormat,
            timestamp_format=timestampFormat,
        )
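    # Usage sketch (illustrative; the file name is hypothetical). Boolean
    # options may also be passed as "true"/"false" strings, as in PySpark:
    #
    #     df.write.csv("out.csv", header=True, sep="|", nullValue="NULL")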


class DataFrameReader:
    def __init__(self, session: "SparkSession"):
        self.session = session

    def load(
        self,
        path: Optional[Union[str, List[str]]] = None,
        format: Optional[str] = None,
        schema: Optional[Union[StructType, str]] = None,
        **options: OptionalPrimitiveType,
    ) -> "DataFrame":
        from duckdb.experimental.spark.sql.dataframe import DataFrame

        # Only a single string path is supported for now.
        if not isinstance(path, str):
            raise NotImplementedError
        if options:
            raise ContributionsAcceptedError
        rel = None
        if format:
            format = format.lower()
            if format == "csv" or format == "tsv":
                # DuckDB's CSV reader sniffs the delimiter, so TSV works too.
                rel = self.session.conn.read_csv(path)
            elif format == "json":
                rel = self.session.conn.read_json(path)
            elif format == "parquet":
                rel = self.session.conn.read_parquet(path)
            else:
                raise ContributionsAcceptedError
        else:
            # No format given: treat the path as something DuckDB can scan
            # directly in a FROM clause (a table name or file path).
            rel = self.session.conn.sql(f"select * from {path}")
        df = DataFrame(rel, self.session)
        if schema:
            if not isinstance(schema, StructType):
                raise ContributionsAcceptedError
            schema = cast(StructType, schema)
            types, names = schema.extract_types_and_names()
            df = df._cast_types(types)
            df = df.toDF(*names)
        return df
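    # Usage sketch (illustrative; paths and table names are hypothetical).
    # With a format, the path goes through the matching DuckDB reader; with
    # no format, the path is interpolated into "select * from {path}", so an
    # existing table name works as well:
    #
    #     df = spark.read.load("data/people.parquet", format="parquet")
    #     df = spark.read.load("people")  # scans the table (or view) "people"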

    def csv(
        self,
        path: Union[str, List[str]],
        schema: Optional[Union[StructType, str]] = None,
        sep: Optional[str] = None,
        encoding: Optional[str] = None,
        quote: Optional[str] = None,
        escape: Optional[str] = None,
        comment: Optional[str] = None,
        header: Optional[Union[bool, str]] = None,
        inferSchema: Optional[Union[bool, str]] = None,
        ignoreLeadingWhiteSpace: Optional[Union[bool, str]] = None,
        ignoreTrailingWhiteSpace: Optional[Union[bool, str]] = None,
        nullValue: Optional[str] = None,
        nanValue: Optional[str] = None,
        positiveInf: Optional[str] = None,
        negativeInf: Optional[str] = None,
        dateFormat: Optional[str] = None,
        timestampFormat: Optional[str] = None,
        maxColumns: Optional[Union[int, str]] = None,
        maxCharsPerColumn: Optional[Union[int, str]] = None,
        maxMalformedLogPerPartition: Optional[Union[int, str]] = None,
        mode: Optional[str] = None,
        columnNameOfCorruptRecord: Optional[str] = None,
        multiLine: Optional[Union[bool, str]] = None,
        charToEscapeQuoteEscaping: Optional[str] = None,
        samplingRatio: Optional[Union[float, str]] = None,
        enforceSchema: Optional[Union[bool, str]] = None,
        emptyValue: Optional[str] = None,
        locale: Optional[str] = None,
        lineSep: Optional[str] = None,
        pathGlobFilter: Optional[Union[bool, str]] = None,
        recursiveFileLookup: Optional[Union[bool, str]] = None,
        modifiedBefore: Optional[Union[bool, str]] = None,
        modifiedAfter: Optional[Union[bool, str]] = None,
        unescapedQuoteHandling: Optional[str] = None,
    ) -> "DataFrame":
        if not isinstance(path, str):
            raise NotImplementedError
        if schema and not isinstance(schema, StructType):
            raise ContributionsAcceptedError
        # The options below are accepted for PySpark API compatibility but
        # are not supported yet.
        if comment:
            raise ContributionsAcceptedError
        if inferSchema:
            raise ContributionsAcceptedError
        if ignoreLeadingWhiteSpace:
            raise ContributionsAcceptedError
        if ignoreTrailingWhiteSpace:
            raise ContributionsAcceptedError
        if nanValue:
            raise ContributionsAcceptedError
        if positiveInf:
            raise ContributionsAcceptedError
        if negativeInf:
            raise ContributionsAcceptedError
        if maxColumns:
            raise ContributionsAcceptedError
        if maxCharsPerColumn:
            raise ContributionsAcceptedError
        if maxMalformedLogPerPartition:
            raise ContributionsAcceptedError
        if mode:
            raise ContributionsAcceptedError
        if columnNameOfCorruptRecord:
            raise ContributionsAcceptedError
        if multiLine:
            raise ContributionsAcceptedError
        if charToEscapeQuoteEscaping:
            raise ContributionsAcceptedError
        if samplingRatio:
            raise ContributionsAcceptedError
        if enforceSchema:
            raise ContributionsAcceptedError
        if emptyValue:
            raise ContributionsAcceptedError
        if locale:
            raise ContributionsAcceptedError
        if pathGlobFilter:
            raise ContributionsAcceptedError
        if recursiveFileLookup:
            raise ContributionsAcceptedError
        if modifiedBefore:
            raise ContributionsAcceptedError
        if modifiedAfter:
            raise ContributionsAcceptedError
        if unescapedQuoteHandling:
            raise ContributionsAcceptedError
        if lineSep:
            # We have support for custom newlines; it just needs to be ported to 'read_csv'.
            raise NotImplementedError

        dtype = None
        names = None
        if schema:
            schema = cast(StructType, schema)
            dtype, names = schema.extract_types_and_names()

        rel = self.session.conn.read_csv(
            path,
            header=header if isinstance(header, bool) else str(header).lower() == "true",
            sep=sep,
            dtype=dtype,
            na_values=nullValue,
            quotechar=quote,
            escapechar=escape,
            encoding=encoding,
            date_format=dateFormat,
            timestamp_format=timestampFormat,
        )
        from ..sql.dataframe import DataFrame

        df = DataFrame(rel, self.session)
        if names:
            df = df.toDF(*names)
        return df
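    # Usage sketch (illustrative; the file name is hypothetical). A
    # StructType schema is split into dtypes for read_csv plus column names
    # that are re-applied via toDF:
    #
    #     from duckdb.experimental.spark.sql.types import (
    #         IntegerType, StringType, StructField, StructType,
    #     )
    #     schema = StructType([
    #         StructField("name", StringType(), True),
    #         StructField("age", IntegerType(), True),
    #     ])
    #     df = spark.read.csv("people.csv", schema=schema, header=True)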

    def parquet(self, *paths: str, **options: "OptionalPrimitiveType") -> "DataFrame":
        input = list(paths)
        if len(input) != 1:
            raise NotImplementedError("Only single paths are supported for now")
        option_amount = len(options.keys())
        if option_amount != 0:
            raise ContributionsAcceptedError("Options are not supported")
        path = input[0]
        rel = self.session.conn.read_parquet(path)
        from ..sql.dataframe import DataFrame

        df = DataFrame(rel, self.session)
        return df
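    # Usage sketch (illustrative; the file name is hypothetical): exactly one
    # positional path and no keyword options are accepted, per the guards above.
    #
    #     df = spark.read.parquet("data/events.parquet")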

    def json(
        self,
        path: Union[str, List[str]],
        schema: Optional[Union[StructType, str]] = None,
        primitivesAsString: Optional[Union[bool, str]] = None,
        prefersDecimal: Optional[Union[bool, str]] = None,
        allowComments: Optional[Union[bool, str]] = None,
        allowUnquotedFieldNames: Optional[Union[bool, str]] = None,
        allowSingleQuotes: Optional[Union[bool, str]] = None,
        allowNumericLeadingZero: Optional[Union[bool, str]] = None,
        allowBackslashEscapingAnyCharacter: Optional[Union[bool, str]] = None,
        mode: Optional[str] = None,
        columnNameOfCorruptRecord: Optional[str] = None,
        dateFormat: Optional[str] = None,
        timestampFormat: Optional[str] = None,
        multiLine: Optional[Union[bool, str]] = None,
        allowUnquotedControlChars: Optional[Union[bool, str]] = None,
        lineSep: Optional[str] = None,
        samplingRatio: Optional[Union[float, str]] = None,
        dropFieldIfAllNull: Optional[Union[bool, str]] = None,
        encoding: Optional[str] = None,
        locale: Optional[str] = None,
        pathGlobFilter: Optional[Union[bool, str]] = None,
        recursiveFileLookup: Optional[Union[bool, str]] = None,
        modifiedBefore: Optional[Union[bool, str]] = None,
        modifiedAfter: Optional[Union[bool, str]] = None,
        allowNonNumericNumbers: Optional[Union[bool, str]] = None,
    ) -> "DataFrame":
        """
        Loads JSON files and returns the results as a :class:`DataFrame`.

        `JSON Lines <http://jsonlines.org/>`_ (newline-delimited JSON) is supported by default.
        For JSON (one record per file), set the ``multiLine`` parameter to ``true``.

        If the ``schema`` parameter is not specified, this function goes
        through the input once to determine the input schema.

        .. versionadded:: 1.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        path : str, list or :class:`RDD`
            string represents path to the JSON dataset, or a list of paths,
            or RDD of Strings storing JSON objects.
        schema : :class:`pyspark.sql.types.StructType` or str, optional
            an optional :class:`pyspark.sql.types.StructType` for the input schema or
            a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).

        Other Parameters
        ----------------
        Extra options
            For the extra options, refer to
            `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_
            for the version you use.

            .. # noqa

        Examples
        --------
        Write a DataFrame into a JSON file and read it back.

        >>> import tempfile
        >>> with tempfile.TemporaryDirectory() as d:
        ...     # Write a DataFrame into a JSON file
        ...     spark.createDataFrame(
        ...         [{"age": 100, "name": "Hyukjin Kwon"}]
        ...     ).write.mode("overwrite").format("json").save(d)
        ...
        ...     # Read the JSON file as a DataFrame.
        ...     spark.read.json(d).show()
        +---+------------+
        |age|        name|
        +---+------------+
        |100|Hyukjin Kwon|
        +---+------------+
        """
        if schema is not None:
            raise ContributionsAcceptedError("The 'schema' option is not supported")
        if primitivesAsString is not None:
            raise ContributionsAcceptedError(
                "The 'primitivesAsString' option is not supported"
            )
        if prefersDecimal is not None:
            raise ContributionsAcceptedError(
                "The 'prefersDecimal' option is not supported"
            )
        if allowComments is not None:
            raise ContributionsAcceptedError(
                "The 'allowComments' option is not supported"
            )
        if allowUnquotedFieldNames is not None:
            raise ContributionsAcceptedError(
                "The 'allowUnquotedFieldNames' option is not supported"
            )
        if allowSingleQuotes is not None:
            raise ContributionsAcceptedError(
                "The 'allowSingleQuotes' option is not supported"
            )
        if allowNumericLeadingZero is not None:
            raise ContributionsAcceptedError(
                "The 'allowNumericLeadingZero' option is not supported"
            )
        if allowBackslashEscapingAnyCharacter is not None:
            raise ContributionsAcceptedError(
                "The 'allowBackslashEscapingAnyCharacter' option is not supported"
            )
        if mode is not None:
            raise ContributionsAcceptedError("The 'mode' option is not supported")
        if columnNameOfCorruptRecord is not None:
            raise ContributionsAcceptedError(
                "The 'columnNameOfCorruptRecord' option is not supported"
            )
        if dateFormat is not None:
            raise ContributionsAcceptedError("The 'dateFormat' option is not supported")
        if timestampFormat is not None:
            raise ContributionsAcceptedError(
                "The 'timestampFormat' option is not supported"
            )
        if multiLine is not None:
            raise ContributionsAcceptedError("The 'multiLine' option is not supported")
        if allowUnquotedControlChars is not None:
            raise ContributionsAcceptedError(
                "The 'allowUnquotedControlChars' option is not supported"
            )
        if lineSep is not None:
            raise ContributionsAcceptedError("The 'lineSep' option is not supported")
        if samplingRatio is not None:
            raise ContributionsAcceptedError(
                "The 'samplingRatio' option is not supported"
            )
        if dropFieldIfAllNull is not None:
            raise ContributionsAcceptedError(
                "The 'dropFieldIfAllNull' option is not supported"
            )
        if encoding is not None:
            raise ContributionsAcceptedError("The 'encoding' option is not supported")
        if locale is not None:
            raise ContributionsAcceptedError("The 'locale' option is not supported")
        if pathGlobFilter is not None:
            raise ContributionsAcceptedError(
                "The 'pathGlobFilter' option is not supported"
            )
        if recursiveFileLookup is not None:
            raise ContributionsAcceptedError(
                "The 'recursiveFileLookup' option is not supported"
            )
        if modifiedBefore is not None:
            raise ContributionsAcceptedError(
                "The 'modifiedBefore' option is not supported"
            )
        if modifiedAfter is not None:
            raise ContributionsAcceptedError(
                "The 'modifiedAfter' option is not supported"
            )
        if allowNonNumericNumbers is not None:
            raise ContributionsAcceptedError(
                "The 'allowNonNumericNumbers' option is not supported"
            )

        if isinstance(path, str):
            path = [path]
        if isinstance(path, list):
            if len(path) == 1:
                rel = self.session.conn.read_json(path[0])
                from .dataframe import DataFrame

                df = DataFrame(rel, self.session)
                return df
            raise PySparkNotImplementedError(
                message="Only a single path is supported for now"
            )
        else:
            raise PySparkTypeError(
                error_class="NOT_STR_OR_LIST_OF_RDD",
                message_parameters={
                    "arg_name": "path",
                    "arg_type": type(path).__name__,
                },
            )

__all__ = ["DataFrameWriter", "DataFrameReader"]