Skip to content

Add model config checks + fix tree views #29

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 22 additions & 15 deletions docs/configuring.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ others at the field level. The general structure of the configuration dict is th
}
```

!!! tip
Table names and column names in the config dict (`table1` and `my_column` in the above example) refer to the names before
any transformation. They refer to the names that can be found in `DataModel.source_tree`.

## Model configuration

The following options can be passed as top-level keys of the model configuration `dict`:
Expand Down Expand Up @@ -147,20 +151,7 @@ automatically applied `join`, as it would require a complex process of adding a

### Elevate children to upper level

If a complex child element has a minimum and maximum occurrences number of 1 and 1 respectively, it can be "pulled" up
to its parent element. This behaviour will always be applied by default.

If a complex child element has a minimum and maximum occurrences number of 0 and 1 respectively, it can also be "pulled"
up to its parent element fields. This is applied by default if the child has less than 5 fields, because otherwise it
could clutter the parent element with many columns that will often be all `NULL`.

This simplification can be opted out using a configuration option, and forced in the case of a child with more than 5
fields, using the following option:

`"transform":` `"elevate"` (default) or `"elevate_wo_prefix"` or `False` (disable).

By default, the elevated field name is prefixed with the name of the complex child so its origin is clear and to prevent
duplicated names, but this prefixing can be avoided with the value `"elevate_wo_prefix"`.
If a complex child element has a maximum number of occurrences of 1, it can be "pulled" up to its parent element.

For example, complex child `timeInterval` with 2 fields of max occurrence 1, before elevation...
```shell
Expand All @@ -170,13 +161,29 @@ timeInterval[1, 1]:
end[1, 1]: string
```

... and after elevation (with prefix):
... and after elevation:
```shell
# Parent fields
timeInterval_start[1, 1]: string
timeInterval_end[1, 1]: string
```

By default, the resulting name concatenates the parent name and the child name.

This transformation will be applied by default when:

* the minimum occurrence number is 1
* or the minimum occurrence number is 0 and the child has fewer than 5 fields, because otherwise it could clutter the parent
element with many columns that will often be all `NULL`.

This can be configured by the `"transform"` option, with the following values:

* `None`: default behaviour,
* `"elevate"`: force the elevation of the child,
* `"elevate_wo_prefix"`: force the elevation of the child, but drop the name of the parent element in the resulting
column name,
* `False`: disable.

!!! example
Force "elevation" of a complex type to its parent:
``` python
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ dependencies = [
"sqlalchemy>1.4",
"xmlschema>=3.3.2",
"lxml>=5.1.0",
"typing-extensions>=4; python_version<'3.11'"
]

[project.optional-dependencies]
Expand Down
2 changes: 1 addition & 1 deletion src/xml2db/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .model import DataModel
from .document import Document
from .model import DataModel
from .table import (
DataModelTable,
DataModelTableReused,
Expand Down
3 changes: 2 additions & 1 deletion src/xml2db/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
from io import BytesIO
from typing import Union, TYPE_CHECKING
from zoneinfo import ZoneInfo

from lxml import etree
from sqlalchemy import Column, Table, text, select
from sqlalchemy.engine import Connection
from sqlalchemy.sql.expression import TextClause
from lxml import etree

if TYPE_CHECKING:
from .model import DataModel
Expand Down
14 changes: 0 additions & 14 deletions src/xml2db/exceptions.py

This file was deleted.

115 changes: 46 additions & 69 deletions src/xml2db/model.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
import logging
import os
from datetime import datetime
from graphlib import TopologicalSorter
from io import BytesIO
from typing import Iterable, Union
from uuid import uuid4
import hashlib

import xmlschema
import sqlalchemy
import xmlschema
from lxml import etree
from sqlalchemy import MetaData, create_engine, inspect
from sqlalchemy.sql.ddl import CreateIndex, CreateTable
from sqlalchemy.exc import ProgrammingError
from graphlib import TopologicalSorter
from sqlalchemy.sql.ddl import CreateIndex, CreateTable

from .document import Document
from .exceptions import DataModelConfigError, check_type
from .model_config import (
DataModelConfigError,
DataModelConfigType,
TableConfigType,
validate_model_config,
)
from .table import (
DataModelTableReused,
DataModelTableDuplicated,
Expand Down Expand Up @@ -73,28 +77,13 @@ def __init__(
short_name: str = "DocumentRoot",
long_name: str = None,
base_url: str = None,
model_config: dict = None,
model_config: DataModelConfigType = None,
connection_string: str = None,
db_engine: sqlalchemy.Engine = None,
db_type: str = None,
db_schema: str = None,
temp_prefix: str = None,
):
self.model_config = self._validate_config(model_config)
self.tables_config = model_config.get("tables", {}) if model_config else {}

xsd_file_name = xsd_file
if base_url is None:
base_url = os.path.normpath(os.path.dirname(xsd_file))
xsd_file_name = os.path.basename(xsd_file)

self.xml_schema = xmlschema.XMLSchema(xsd_file_name, base_url=base_url)
self.lxml_schema = etree.XMLSchema(etree.parse(xsd_file))

self.xml_converter = XMLConverter(data_model=self)
self.data_flow_name = short_name
self.data_flow_long_name = long_name

if connection_string is None and db_engine is None:
logger.warning(
"DataModel created without connection string cannot do actual imports"
Expand All @@ -117,6 +106,20 @@ def __init__(
)
self.db_type = self.engine.dialect.name

self.model_config = validate_model_config(model_config, self.db_type)

xsd_file_name = xsd_file
if base_url is None:
base_url = os.path.normpath(os.path.dirname(xsd_file))
xsd_file_name = os.path.basename(xsd_file)

self.xml_schema = xmlschema.XMLSchema(xsd_file_name, base_url=base_url)
self.lxml_schema = etree.XMLSchema(etree.parse(xsd_file))

self.xml_converter = XMLConverter(data_model=self)
self.data_flow_name = short_name
self.data_flow_long_name = long_name

self.db_schema = db_schema
self.temp_prefix = str(uuid4())[:8] if temp_prefix is None else temp_prefix

Expand All @@ -135,30 +138,6 @@ def __init__(

self._build_model()

def _validate_config(self, cfg):
    """Check the top-level model config and fill in default values

    Every supported option is passed to `check_type` together with its expected
    type and its default value, and the values returned by `check_type` are
    collected in a new dict.

    This method reads `self.db_type`, which must therefore be set before it is
    called with `as_columnstore` enabled.

    Args:
        cfg: the user-provided model configuration dict, or None for an empty config

    Returns:
        A dict with one entry per supported top-level option.
    """
    if cfg is None:
        cfg = {}
    # NOTE(review): check_type is defined in another module; presumably it raises
    # when a provided value does not match the expected type - confirm.
    model_config = {
        key: check_type(cfg, key, exp_type, default)
        for key, exp_type, default in [
            ("as_columnstore", bool, False),
            ("row_numbers", bool, False),
            ("document_tree_hook", callable, None),
            ("document_tree_node_hook", callable, None),
            ("record_hash_column_name", str, "xml2db_record_hash"),
            ("record_hash_constructor", callable, hashlib.sha1),
            ("record_hash_size", int, 20),
            ("metadata_columns", list, []),
        ]
    }
    # Columnstore indexes only exist on MS SQL Server: turn the option off for any
    # *other* backend (the previous test used `==` and disabled it on mssql itself).
    if model_config["as_columnstore"] and self.db_type != "mssql":
        model_config["as_columnstore"] = False
        logger.info(
            "Clustered columnstore indexes are only supported with MS SQL Server database, noop"
        )

    return model_config

@property
def fk_ordered_tables(
self,
Expand All @@ -179,6 +158,7 @@ def _create_table_model(
self,
table_name: str,
type_name: str,
table_config: TableConfigType,
is_root_table: bool = False,
is_virtual_node: bool = False,
) -> Union[DataModelTableReused, DataModelTableDuplicated]:
Expand All @@ -187,13 +167,13 @@ def _create_table_model(
Args:
table_name: name of the table
type_name: type of the table
table_config: dict config for the table
is_root_table: is this table the root table?
is_virtual_node: was this table created to store multiple root elements?

Returns:
A data model instance.
"""
table_config = self.tables_config.get(table_name, {})
if table_config.get("reuse", True):
return DataModelTableReused(
table_name,
Expand Down Expand Up @@ -230,7 +210,7 @@ def _build_model(self):
)
self.root_table = root_table.type_name
# compute a text representation of the original data model and store it
self.source_tree = "\n".join(self._repr_tree(root_table))
self.source_tree = str(root_table)
# check user-provided configuration for tables
for tb_config in self.model_config.get("tables", {}):
if tb_config not in self.names_types_map:
Expand All @@ -245,7 +225,7 @@ def _build_model(self):
key: tb for key, tb in self.tables.items() if hasattr(tb, "keep_table")
}
# compute a text representation of the simplified data model and store it
self.target_tree = "\n".join(self._repr_tree(root_table))
self.target_tree = str(root_table)
# add parent table information on each table when it is not reused
# raises an error if a table is not configured as "reused" and has more than 1 parent table
for tb in self.tables.values():
Expand Down Expand Up @@ -319,12 +299,30 @@ def _parse_tree(self, parent_node: xmlschema.XsdElement, nodes_path: list = None
while "_".join([parent_name, str(i)]) in self.names_types_map:
i += 1
parent_name = "_".join([parent_name, str(i)])

table_config = self.model_config["tables"].get(parent_name, {})

# validate fields config (raise if useless config is provided)
children_names = {child.local_name for child in parent_node}
attributes_names = set(parent_node.attributes.keys())
if "product21" in table_config.get("fields", {}):
print("ok")
unused_fields_config = set(
table_config.get("fields", {}).keys()
) - children_names.union(attributes_names)
if len(unused_fields_config) > 0:
raise DataModelConfigError(
f"config provided for field '{unused_fields_config.pop()}' for table '{parent_name}'"
" while this field does not exist in this table"
)

self.names_types_map[parent_name] = parent_type

# create a new table object associated with the element
parent_table = self._create_table_model(
parent_name,
parent_type,
table_config,
len(nodes_path) == 1,
isinstance(parent_node, xmlschema.XMLSchema),
)
Expand Down Expand Up @@ -424,10 +422,7 @@ def get_occurs(particle):
]

# go through item attributes and add them as columns, adding a suffix if an element with the same name exists
children_names = None
for attrib_name, attrib in parent_node.attributes.items():
if children_names is None:
children_names = [child.local_name for child in parent_node]
(
data_type,
min_length,
Expand Down Expand Up @@ -569,24 +564,6 @@ def get_occurs(particle):

return parent_table

def _repr_tree(
    self,
    parent_table: Union[DataModelTableReused, DataModelTableDuplicated],
):
    """Yield, line by line, a text outline of the data model tree

    A field of type "col" produces a single `name[occurs]: data_type` line. Any
    other field produces a header line (flagged with " (choice)" when the related
    table's model group is "choice"), followed by the lines of the related table,
    each with a leading prefix to show nesting.

    Args:
        parent_table: the data model table object whose fields are rendered
    """
    for kind, _, item in parent_table.fields:
        if kind == "col":
            yield f"{item.name}{item.occurs}: {item.data_type}"
            continue
        child_table = item.other_table
        choice_tag = " (choice)" if child_table.model_group == "choice" else ""
        yield f"{item.name}{item.occurs}{choice_tag}:"
        # recurse into the related table and prefix each of its lines
        yield from (f" {sub_line}" for sub_line in self._repr_tree(child_table))

def get_entity_rel_diagram(self, text_context: bool = True) -> str:
"""Build an entity relationship diagram for the data model

Expand Down
Loading