Skip to content

Commit 0df51bd

Browse files
Add overwrite method to schema on schema update
1 parent 1d24e71 commit 0df51bd

File tree

3 files changed

+93
-1
lines changed

3 files changed

+93
-1
lines changed

mkdocs/docs/api.md

+30
Original file line numberDiff line numberDiff line change
@@ -1088,6 +1088,36 @@ with table.transaction() as transaction:
10881088
# ... Update properties etc
10891089
```
10901090

1091+
### Overwrite schema
1092+
1093+
To overwrite the entire schema of a table, use the `overwrite` method:
1094+
1095+
```python
1096+
from pyiceberg.catalog import load_catalog
1097+
from pyiceberg.schema import Schema
1098+
from pyiceberg.types import NestedField, StringType, DoubleType
1099+
1100+
catalog = load_catalog()
1101+
1102+
initial_schema = Schema(
1103+
NestedField(1, "city_name", StringType(), required=False),
1104+
NestedField(2, "latitude", DoubleType(), required=False),
1105+
NestedField(3, "longitude", DoubleType(), required=False),
1106+
)
1107+
1108+
table = catalog.create_table("default.locations", initial_schema)
1109+
1110+
new_schema = Schema(
1111+
NestedField(1, "city", StringType(), required=False),
1112+
NestedField(2, "lat", DoubleType(), required=False),
1113+
NestedField(3, "long", DoubleType(), required=False),
1114+
NestedField(4, "population", LongType(), required=False),
1115+
)
1116+
1117+
with table.update_schema() as update:
1118+
update.overwrite(new_schema)
1119+
```
1120+
10911121
### Union by Name
10921122

10931123
Using `.union_by_name()` you can merge another schema into an existing schema without having to worry about field-IDs:

pyiceberg/table/update/schema.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,28 @@ def case_sensitive(self, case_sensitive: bool) -> UpdateSchema:
140140
self._case_sensitive = case_sensitive
141141
return self
142142

143+
def overwrite(self, new_schema: Union[Schema, "pa.Schema"]) -> UpdateSchema:
144+
"""Overwrite the schema with a new schema.
145+
146+
Args:
147+
new_schema: The new schema to overwrite with.
148+
149+
Returns:
150+
This for method chaining.
151+
"""
152+
from pyiceberg.catalog import Catalog
153+
154+
new_schema = Catalog._convert_schema_if_needed(new_schema)
155+
156+
for field in self._schema.fields:
157+
self.delete_column(field.name)
158+
159+
new_schema = Catalog._convert_schema_if_needed(new_schema)
160+
for field in new_schema.fields:
161+
self.add_column(field.name, field.field_type, field.doc, field.required)
162+
163+
return self
164+
143165
def union_by_name(self, new_schema: Union[Schema, "pa.Schema"]) -> UpdateSchema:
144166
from pyiceberg.catalog import Catalog
145167

@@ -153,7 +175,11 @@ def union_by_name(self, new_schema: Union[Schema, "pa.Schema"]) -> UpdateSchema:
153175
return self
154176

155177
def add_column(
156-
self, path: Union[str, Tuple[str, ...]], field_type: IcebergType, doc: Optional[str] = None, required: bool = False
178+
self,
179+
path: Union[str, Tuple[str, ...]],
180+
field_type: IcebergType,
181+
doc: Optional[str] = None,
182+
required: bool = False,
157183
) -> UpdateSchema:
158184
"""Add a new column to a nested struct or Add a new top-level column.
159185

tests/test_schema.py

+36
Original file line numberDiff line numberDiff line change
@@ -1653,3 +1653,39 @@ def test_arrow_schema() -> None:
16531653
)
16541654

16551655
assert base_schema.as_arrow() == expected_schema
1656+
1657+
1658+
def test_overwrite_schema() -> None:
1659+
base_schema = Schema(NestedField(field_id=1, name="old", field_type=StringType(), required=True))
1660+
1661+
expected_schema = Schema(
1662+
NestedField(field_id=1, name="foo", field_type=StringType(), required=True),
1663+
NestedField(field_id=2, name="bar", field_type=IntegerType(), required=False),
1664+
NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False),
1665+
)
1666+
1667+
new_schema = UpdateSchema(transaction=None, schema=base_schema).overwrite(expected_schema)._apply() # type: ignore
1668+
1669+
assert new_schema == expected_schema
1670+
1671+
1672+
def test_overwrite_with_pa_schema() -> None:
1673+
base_schema = Schema(NestedField(field_id=1, name="old", field_type=StringType(), required=True))
1674+
1675+
pa_schema = pa.schema(
1676+
[
1677+
pa.field("foo", pa.string(), nullable=False),
1678+
pa.field("bar", pa.int32(), nullable=True),
1679+
pa.field("baz", pa.bool_(), nullable=True),
1680+
]
1681+
)
1682+
1683+
new_schema = UpdateSchema(transaction=None, schema=base_schema).overwrite(pa_schema)._apply() # type: ignore
1684+
1685+
expected_schema = Schema(
1686+
NestedField(field_id=1, name="foo", field_type=StringType(), required=True),
1687+
NestedField(field_id=2, name="bar", field_type=IntegerType(), required=False),
1688+
NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False),
1689+
)
1690+
1691+
assert new_schema == expected_schema

0 commit comments

Comments
 (0)