From 87fbca6399a2729e56ebf13d4eb45e4ed0443e35 Mon Sep 17 00:00:00 2001 From: luan Date: Wed, 25 Feb 2026 12:06:01 +0000 Subject: [PATCH 1/3] docs: type mapping between pyiceberg and pyarrow --- pyiceberg/type_mapping.py | 72 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 pyiceberg/type_mapping.py diff --git a/pyiceberg/type_mapping.py b/pyiceberg/type_mapping.py new file mode 100644 index 0000000000..108572d80a --- /dev/null +++ b/pyiceberg/type_mapping.py @@ -0,0 +1,72 @@ +"""Type mapping between PyArrow and Iceberg types. + +## PyArrow +The Iceberg specification only specifies type mapping for Avro, Parquet, and ORC: + +- [Iceberg to Avro](https://iceberg.apache.org/spec/#avro) + +- [Iceberg to Parquet](https://iceberg.apache.org/spec/#parquet) + +- [Iceberg to ORC](https://iceberg.apache.org/spec/#orc) + +Refer to the following tables for type mapping in both direction for PyIceberg types and PyArrow types. + +### PyIceberg to PyArrow type mapping + +| PyIceberg type class | PyArrow type | Notes | +|---------------------------------|-------------------------------------|----------------------------------------| +| `BooleanType` | `pa.bool_()` | | +| `IntegerType` | `pa.int32()` | | +| `LongType` | `pa.int64()` | | +| `FloatType` | `pa.float32()` | | +| `DoubleType` | `pa.float64()` | | +| `DecimalType(p, s)` | `pa.decimal128(p, s)` | | +| `DateType` | `pa.date32()` | | +| `TimeType` | `pa.time64("us")` | | +| `TimestampType` | `pa.timestamp("us")` | | +| `TimestampNanoType` | `pa.timestamp("ns")` | | +| `TimestamptzType` | `pa.timestamp("us", tz="UTC")` | | +| `TimestamptzNanoType` | `pa.timestamp("ns", tz="UTC")` | | +| `StringType` | `pa.large_string()` | | +| `UUIDType` | `pa.uuid()` | | +| `BinaryType` | `pa.large_binary()` | | +| `FixedType(L)` | `pa.binary(L)` | | +| `StructType` | `pa.struct()` | | +| `ListType(e)` | `pa.large_list(e)` | | +| `MapType(k, v)` | `pa.map_(k, v)` | | +| `UnknownType` | `pa.null()` 
| | + +--- +### PyArrow to PyIceberg type mapping + +| PyArrow type | PyIceberg type class | Notes | +|------------------------------------|-----------------------------|--------------------------------| +| `pa.bool_()` | `BooleanType` | | +| `pa.int32()` | `IntegerType` | | +| `pa.int64()` | `LongType` | | +| `pa.float32()` | `FloatType` | | +| `pa.float64()` | `DoubleType` | | +| `pa.decimal128(p, s)` | `DecimalType(p, s)` | | +| `pa.decimal256(p, s)` | Unsupported | | +| `pa.date32()` | `DateType` | | +| `pa.date64()` | Unsupported | | +| `pa.time64("us")` | `TimeType` | | +| `pa.timestamp("us")` | `TimestampType` | | +| `pa.timestamp("ns")` | `TimestampNanoType` | | +| `pa.timestamp("us", tz="UTC")` | `TimestamptzType` | | +| `pa.timestamp("ns", tz="UTC")` | `TimestamptzNanoType` | | +| `pa.string()` / `pa.large_string()`| `StringType` | | +| `pa.uuid()` | `UUIDType` | | +| `pa.binary()` / `pa.large_binary()`| `BinaryType` | | +| `pa.binary(L)` | `FixedType(L)` | Fixed-length byte arrays | +| `pa.struct([...])` | `StructType` | | +| `pa.list_(e)` / `pa.large_list(e)` | `ListType(e)` | | +| `pa.map_(k, v)` | `MapType(k, v)` | | +| `pa.null()` | `UnknownType` | | + +--- + +### Notes +- PyIceberg `GeometryType` and `GeographyType` types are mapped to a GeoArrow WKB extension type. +Otherwise, falls back to `pa.large_binary()` which stores WKB bytes. +""" From 54905d10ed3a691cb8b897726acbd578eba7adee Mon Sep 17 00:00:00 2001 From: luan Date: Wed, 25 Feb 2026 12:43:30 +0000 Subject: [PATCH 2/3] add license header --- pyiceberg/type_mapping.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pyiceberg/type_mapping.py b/pyiceberg/type_mapping.py index 108572d80a..f47b06d0da 100644 --- a/pyiceberg/type_mapping.py +++ b/pyiceberg/type_mapping.py @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. """Type mapping between PyArrow and Iceberg types. ## PyArrow From 26b12e02d8b3ae43678a8be70b5a3adccc3bdb57 Mon Sep 17 00:00:00 2001 From: luan Date: Thu, 26 Feb 2026 19:41:34 +0000 Subject: [PATCH 3/3] move md file to API section --- mkdocs/docs/api.md | 79 +++++++++++++++++++++++++++++++++++ pyiceberg/type_mapping.py | 88 --------------------------------------- 2 files changed, 79 insertions(+), 88 deletions(-) delete mode 100644 pyiceberg/type_mapping.py diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 506547fcd6..654cb398a2 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -2039,3 +2039,82 @@ DataFrame() | 3 | 6 | +---+---+ ``` + +## Type mapping + +### PyArrow + +The Iceberg specification only specifies type mapping for Avro, Parquet, and ORC: + +- [Iceberg to Avro](https://iceberg.apache.org/spec/#avro) + +- [Iceberg to Parquet](https://iceberg.apache.org/spec/#parquet) + +- [Iceberg to ORC](https://iceberg.apache.org/spec/#orc) + +The following tables describe the type mappings between PyIceberg and PyArrow. 
In the tables below, `pa` refers to the `pyarrow` module: + +```python +import pyarrow as pa +``` + +#### PyIceberg to PyArrow type mapping + +| PyIceberg type class | PyArrow type | Notes | +|---------------------------------|-------------------------------------|----------------------------------------| +| `BooleanType` | `pa.bool_()` | | +| `IntegerType` | `pa.int32()` | | +| `LongType` | `pa.int64()` | | +| `FloatType` | `pa.float32()` | | +| `DoubleType` | `pa.float64()` | | +| `DecimalType(p, s)` | `pa.decimal128(p, s)` | | +| `DateType` | `pa.date32()` | | +| `TimeType` | `pa.time64("us")` | | +| `TimestampType` | `pa.timestamp("us")` | | +| `TimestampNanoType` | `pa.timestamp("ns")` | | +| `TimestamptzType` | `pa.timestamp("us", tz="UTC")` | | +| `TimestamptzNanoType` | `pa.timestamp("ns", tz="UTC")` | | +| `StringType` | `pa.large_string()` | | +| `UUIDType` | `pa.uuid()` | | +| `BinaryType` | `pa.large_binary()` | | +| `FixedType(L)` | `pa.binary(L)` | | +| `StructType` | `pa.struct([...])` | | +| `ListType(e)` | `pa.large_list(e)` | | +| `MapType(k, v)` | `pa.map_(k, v)` | | +| `UnknownType` | `pa.null()` | | + +--- + +#### PyArrow to PyIceberg type mapping + +| PyArrow type | PyIceberg type class | Notes | +|------------------------------------|-----------------------------|--------------------------------| +| `pa.bool_()` | `BooleanType` | | +| `pa.int32()` | `IntegerType` | | +| `pa.int64()` | `LongType` | | +| `pa.float32()` | `FloatType` | | +| `pa.float64()` | `DoubleType` | | +| `pa.decimal128(p, s)` | `DecimalType(p, s)` | | +| `pa.decimal256(p, s)` | Unsupported | | +| `pa.date32()` | `DateType` | | +| `pa.date64()` | Unsupported | | +| `pa.time64("us")` | `TimeType` | | +| `pa.timestamp("us")` | `TimestampType` | | +| `pa.timestamp("ns")` | `TimestampNanoType` | | +| `pa.timestamp("us", tz="UTC")` | `TimestamptzType` | | +| `pa.timestamp("ns", tz="UTC")` | `TimestamptzNanoType` | | +| `pa.string()` / `pa.large_string()`| `StringType` | | +| 
`pa.uuid()` | `UUIDType` | | +| `pa.binary()` / `pa.large_binary()`| `BinaryType` | | +| `pa.binary(L)` | `FixedType(L)` | Fixed-length byte arrays | +| `pa.struct([...])` | `StructType` | | +| `pa.list_(e)` / `pa.large_list(e)` | `ListType(e)` | | +| `pa.map_(k, v)` | `MapType(k, v)` | | +| `pa.null()` | `UnknownType` | | + +--- + +***Notes*** + +- PyIceberg `GeometryType` and `GeographyType` types are mapped to a GeoArrow WKB extension type when the optional `geoarrow-pyarrow` package is installed. +Otherwise, they fall back to `pa.large_binary()`, which stores the WKB bytes. diff --git a/pyiceberg/type_mapping.py b/pyiceberg/type_mapping.py deleted file mode 100644 index f47b06d0da..0000000000 --- a/pyiceberg/type_mapping.py +++ /dev/null @@ -1,88 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Type mapping between PyArrow and Iceberg types. - -## PyArrow -The Iceberg specification only specifies type mapping for Avro, Parquet, and ORC: - -- [Iceberg to Avro](https://iceberg.apache.org/spec/#avro) - -- [Iceberg to Parquet](https://iceberg.apache.org/spec/#parquet) - -- [Iceberg to ORC](https://iceberg.apache.org/spec/#orc) - -Refer to the following tables for type mapping in both direction for PyIceberg types and PyArrow types. 
- -### PyIceberg to PyArrow type mapping - -| PyIceberg type class | PyArrow type | Notes | -|---------------------------------|-------------------------------------|----------------------------------------| -| `BooleanType` | `pa.bool_()` | | -| `IntegerType` | `pa.int32()` | | -| `LongType` | `pa.int64()` | | -| `FloatType` | `pa.float32()` | | -| `DoubleType` | `pa.float64()` | | -| `DecimalType(p, s)` | `pa.decimal128(p, s)` | | -| `DateType` | `pa.date32()` | | -| `TimeType` | `pa.time64("us")` | | -| `TimestampType` | `pa.timestamp("us")` | | -| `TimestampNanoType` | `pa.timestamp("ns")` | | -| `TimestamptzType` | `pa.timestamp("us", tz="UTC")` | | -| `TimestamptzNanoType` | `pa.timestamp("ns", tz="UTC")` | | -| `StringType` | `pa.large_string()` | | -| `UUIDType` | `pa.uuid()` | | -| `BinaryType` | `pa.large_binary()` | | -| `FixedType(L)` | `pa.binary(L)` | | -| `StructType` | `pa.struct()` | | -| `ListType(e)` | `pa.large_list(e)` | | -| `MapType(k, v)` | `pa.map_(k, v)` | | -| `UnknownType` | `pa.null()` | | - ---- -### PyArrow to PyIceberg type mapping - -| PyArrow type | PyIceberg type class | Notes | -|------------------------------------|-----------------------------|--------------------------------| -| `pa.bool_()` | `BooleanType` | | -| `pa.int32()` | `IntegerType` | | -| `pa.int64()` | `LongType` | | -| `pa.float32()` | `FloatType` | | -| `pa.float64()` | `DoubleType` | | -| `pa.decimal128(p, s)` | `DecimalType(p, s)` | | -| `pa.decimal256(p, s)` | Unsupported | | -| `pa.date32()` | `DateType` | | -| `pa.date64()` | Unsupported | | -| `pa.time64("us")` | `TimeType` | | -| `pa.timestamp("us")` | `TimestampType` | | -| `pa.timestamp("ns")` | `TimestampNanoType` | | -| `pa.timestamp("us", tz="UTC")` | `TimestamptzType` | | -| `pa.timestamp("ns", tz="UTC")` | `TimestamptzNanoType` | | -| `pa.string()` / `pa.large_string()`| `StringType` | | -| `pa.uuid()` | `UUIDType` | | -| `pa.binary()` / `pa.large_binary()`| `BinaryType` | | -| `pa.binary(L)` | 
`FixedType(L)` | Fixed-length byte arrays | -| `pa.struct([...])` | `StructType` | | -| `pa.list_(e)` / `pa.large_list(e)` | `ListType(e)` | | -| `pa.map_(k, v)` | `MapType(k, v)` | | -| `pa.null()` | `UnknownType` | | - ---- - -### Notes -- PyIceberg `GeometryType` and `GeographyType` types are mapped to a GeoArrow WKB extension type. -Otherwise, falls back to `pa.large_binary()` which stores WKB bytes. -"""