
Commit

feat: adapt dataset selection to constants mechanism
edelclaux committed Jan 2, 2025
1 parent 283e01a commit 5bc9fcb
Showing 4 changed files with 89 additions and 100 deletions.
26 changes: 13 additions & 13 deletions backend/geonature/core/gn_synthese/imports/actions.py
@@ -89,6 +89,7 @@ def check_transient_data(task, logger, imprt: TImports):
field_name: fields[field_name]
for field_name, source_field in imprt.fieldmapping.items()
if source_field.get("column_src", None) in imprt.columns
or source_field.get("default_value", None) is not None
}
init_rows_validity(imprt)
task.update_state(state="PROGRESS", meta={"progress": 0.05})
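
A minimal, self-contained sketch of the selection rule in the hunk above, with made-up column names and mapping entries (the real objects come from the import's field mapping): a mapped field is now kept either because its source column is present in the uploaded file or because the mapping provides a default value (constant).

columns = ["nom_cite", "date_min"]  # columns actually present in the uploaded file
fieldmapping = {
    "nom_cite": {"column_src": "nom_cite"},
    "date_min": {"column_src": "date_min"},
    "id_dataset": {"default_value": 42},            # no source column, but a constant value
    "altitude_max": {"column_src": "alt_missing"},  # column absent and no default: dropped
}
fields = {name: f"<Field {name}>" for name in fieldmapping}  # stand-in for the real field objects

selected_fields = {
    field_name: fields[field_name]
    for field_name, source_field in fieldmapping.items()
    if source_field.get("column_src", None) in columns
    or source_field.get("default_value", None) is not None
}
print(sorted(selected_fields))  # ['date_min', 'id_dataset', 'nom_cite']
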
@@ -246,7 +247,6 @@ def update_batch_progress(batch, step):
entity,
fields["id_nomenclature_blurring"],
fields["id_dataset"],
fields["unique_dataset_id"],
)
if current_app.config["IMPORT"]["CHECK_REF_BIBLIO_LITTERATURE"]:
check_nomenclature_source_status(
@@ -275,16 +275,20 @@ def update_batch_progress(batch, step):

if "unique_id_sinp" in selected_fields:
check_duplicate_uuid(imprt, entity, selected_fields["unique_id_sinp"])
# TODO: what is this?
if current_app.config["IMPORT"]["PER_DATASET_UUID_CHECK"]:
whereclause = Synthese.id_dataset == imprt.id_dataset
check_existing_uuid(
imprt,
entity,
selected_fields["unique_id_sinp"],
id_dataset_field=selected_fields["id_dataset"],
)
else:
whereclause = sa.true()
check_existing_uuid(
imprt,
entity,
selected_fields["unique_id_sinp"],
whereclause=whereclause,
)
check_existing_uuid(
imprt,
entity,
selected_fields["unique_id_sinp"],
)
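
A hedged, self-contained sketch of what the two branches above are presumably meant to express (the table and column definitions are illustrative placeholders, not the real Synthese model or transient table): with PER_DATASET_UUID_CHECK enabled, an imported UUID now only collides with existing rows of the same dataset, the dataset being taken per row from the transient table instead of the single Synthese.id_dataset == imprt.id_dataset constant used before.

import sqlalchemy as sa

metadata = sa.MetaData()
synthese = sa.Table(
    "synthese", metadata,
    sa.Column("unique_id_sinp", sa.String),
    sa.Column("id_dataset", sa.Integer),
)
transient_table = sa.Table(
    "t_imports_synthese", metadata,
    sa.Column("unique_id_sinp", sa.String),
    sa.Column("id_dataset", sa.Integer),
)

def existing_uuid_clause(per_dataset_check: bool):
    clause = transient_table.c.unique_id_sinp == synthese.c.unique_id_sinp
    if per_dataset_check:
        # scope the collision test to the dataset of each imported row
        clause = sa.and_(clause, transient_table.c.id_dataset == synthese.c.id_dataset)
    return clause

print(sa.select(transient_table.c.unique_id_sinp).where(existing_uuid_clause(True)))
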
if imprt.fieldmapping.get(
"unique_id_sinp_generate",
current_app.config["IMPORT"]["DEFAULT_GENERATE_MISSING_UUID"],
@@ -358,14 +362,11 @@ def import_data_to_destination(imprt: TImports) -> None:
):
insert_fields |= {field}

insert_fields -= {fields["unique_dataset_id"]} # Column only used for filling `id_dataset`

select_stmt = (
sa.select(
*[transient_table.c[field.dest_field] for field in insert_fields],
sa.literal(source.id_source),
sa.literal(source.module.id_module),
sa.literal(imprt.id_dataset),
sa.literal(imprt.id_import),
sa.literal("I"),
)
@@ -375,7 +376,6 @@ def import_data_to_destination(imprt: TImports) -> None:
names = [field.dest_field for field in insert_fields] + [
"id_source",
"id_module",
"id_dataset",
"id_import",
"last_action",
]
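
A from_select sketch of the insertion pattern touched by the last two hunks, with illustrative table definitions: per-row values, now including id_dataset, are read from the transient table, while constants such as id_import and last_action are injected as literals, so the sa.literal(imprt.id_dataset) constant and the extra "id_dataset" entry in names are no longer needed.

import sqlalchemy as sa

metadata = sa.MetaData()
transient_table = sa.Table(
    "t_imports_synthese", metadata,
    sa.Column("nom_cite", sa.String),
    sa.Column("id_dataset", sa.Integer),
)
synthese = sa.Table(
    "synthese", metadata,
    sa.Column("nom_cite", sa.String),
    sa.Column("id_dataset", sa.Integer),
    sa.Column("id_import", sa.Integer),
    sa.Column("last_action", sa.String),
)

select_stmt = sa.select(
    transient_table.c.nom_cite,
    transient_table.c.id_dataset,  # now a regular per-row column
    sa.literal(1),                 # placeholder for imprt.id_import
    sa.literal("I"),
)
names = ["nom_cite", "id_dataset", "id_import", "last_action"]
print(sa.insert(synthese).from_select(names, select_stmt))
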
119 changes: 58 additions & 61 deletions backend/geonature/core/imports/checks/dataframe/core.py
@@ -194,7 +194,7 @@ def check_datasets(
) -> Set[str]:
"""
Check if datasets exist and are authorized for the user and import.
It also fills the id_field based on the content of the uuid_field.
Parameters
----------
imprt : TImports
@@ -222,72 +222,69 @@
"""
updated_cols = set()
uuid_col = uuid_field.dest_field
id_col = id_field.dest_field

if uuid_col in df:
has_uuid_mask = df[uuid_col].notnull()
uuid = df.loc[has_uuid_mask, uuid_col].unique().tolist()
uuid_col = uuid_field.source_column

datasets = {
str(ds.unique_dataset_id): ds
for ds in TDatasets.query.filter(TDatasets.unique_dataset_id.in_(uuid))
.options(sa.orm.joinedload(TDatasets.nomenclature_data_origin))
.options(sa.orm.raiseload("*"))
.all()
if uuid_col not in df:
yield {
"error_code": ImportCodeError.MISSING_VALUE,
"column": uuid_field.name_field,
}
valid_ds_mask = df[uuid_col].isin(datasets.keys())
invalid_ds_mask = has_uuid_mask & ~valid_ds_mask
if invalid_ds_mask.any():
yield {
"error_code": ImportCodeError.DATASET_NOT_FOUND,
"column": uuid_field.name_field,
"invalid_rows": df[invalid_ds_mask],
}

inactive_dataset = [uuid for uuid, ds in datasets.items() if not ds.active]
inactive_dataset_mask = df[uuid_col].isin(inactive_dataset)
if inactive_dataset_mask.any():
yield {
"error_code": ImportCodeError.DATASET_NOT_ACTIVE,
"column": uuid_field.name_field,
"invalid_rows": df[inactive_dataset_mask],
}

# Warning: we only check the permissions of the first author, but currently there is only one author per import.
authorized_datasets = {
str(ds.unique_dataset_id): ds
for ds in db.session.execute(
TDatasets.filter_by_creatable(
user=imprt.authors[0], module_code=module_code, object_code=object_code
)
.where(TDatasets.unique_dataset_id.in_(uuid))
.options(sa.orm.raiseload("*"))
)
.scalars()
.all()
uuid = df[uuid_col].unique().tolist()

datasets = {
str(ds.unique_dataset_id): ds
for ds in TDatasets.query.filter(TDatasets.unique_dataset_id.in_(uuid))
.options(sa.orm.joinedload(TDatasets.nomenclature_data_origin))
.options(sa.orm.raiseload("*"))
.all()
}
valid_ds_mask = df[uuid_col].isin(datasets.keys())
invalid_ds_mask = ~valid_ds_mask
if invalid_ds_mask.any():
yield {
"error_code": ImportCodeError.DATASET_NOT_FOUND,
"column": uuid_field.name_field,
"invalid_rows": df[invalid_ds_mask],
}
authorized_ds_mask = df[uuid_col].isin(authorized_datasets.keys())
unauthorized_ds_mask = valid_ds_mask & ~authorized_ds_mask
if unauthorized_ds_mask.any():
yield {
"error_code": ImportCodeError.DATASET_NOT_AUTHORIZED,
"column": uuid_field.name_field,
"invalid_rows": df[unauthorized_ds_mask],
}

if authorized_ds_mask.any():
df.loc[authorized_ds_mask, id_col] = df[authorized_ds_mask][uuid_col].apply(
lambda uuid: authorized_datasets[uuid].id_dataset
)
updated_cols = {id_col}

else:
has_uuid_mask = pd.Series(False, index=df.index)
inactive_dataset = [uuid for uuid, ds in datasets.items() if not ds.active]
inactive_dataset_mask = df[uuid_col].isin(inactive_dataset)
if inactive_dataset_mask.any():
yield {
"error_code": ImportCodeError.DATASET_NOT_ACTIVE,
"column": uuid_field.name_field,
"invalid_rows": df[inactive_dataset_mask],
}

if (~has_uuid_mask).any():
# Set id_dataset from import for empty cells:
df.loc[~has_uuid_mask, id_col] = imprt.id_dataset
# Warning: we only check the permissions of the first author, but currently there is only one author per import.
authorized_datasets = {
str(ds.unique_dataset_id): ds
for ds in db.session.execute(
TDatasets.filter_by_creatable(
user=imprt.authors[0], module_code=module_code, object_code=object_code
)
.where(TDatasets.unique_dataset_id.in_(uuid))
.options(sa.orm.raiseload("*"))
)
.scalars()
.all()
}
authorized_ds_mask = df[uuid_col].isin(authorized_datasets.keys())
unauthorized_ds_mask = valid_ds_mask & ~authorized_ds_mask
if unauthorized_ds_mask.any():
yield {
"error_code": ImportCodeError.DATASET_NOT_AUTHORIZED,
"column": uuid_field.name_field,
"invalid_rows": df[unauthorized_ds_mask],
}

# compute id_col based on uuid_col
if authorized_ds_mask.any():
id_col = id_field.dest_field
df.loc[authorized_ds_mask, id_col] = df[authorized_ds_mask][uuid_col].apply(
lambda uuid: authorized_datasets[uuid].id_dataset
)
updated_cols = {id_col}

return updated_cols
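
A toy pandas sketch of the mask logic used by the rewritten check, with made-up UUIDs and dataset ids: unknown UUIDs are reported as DATASET_NOT_FOUND, known-but-unauthorized ones as DATASET_NOT_AUTHORIZED, and id_dataset is filled only for rows whose dataset the import author is allowed to use.

import pandas as pd

datasets = {"uuid-a": 1, "uuid-b": 2}  # known datasets: UUID -> id_dataset
authorized = {"uuid-a": 1}             # subset creatable by the import author

df = pd.DataFrame({"src_unique_dataset_id": ["uuid-a", "uuid-b", "uuid-zzz"]})
uuid_col = "src_unique_dataset_id"

valid_ds_mask = df[uuid_col].isin(datasets.keys())
invalid_ds_mask = ~valid_ds_mask                            # -> DATASET_NOT_FOUND rows
authorized_ds_mask = df[uuid_col].isin(authorized.keys())
unauthorized_ds_mask = valid_ds_mask & ~authorized_ds_mask  # -> DATASET_NOT_AUTHORIZED rows

df.loc[authorized_ds_mask, "id_dataset"] = df.loc[authorized_ds_mask, uuid_col].map(authorized)
print(df)
# roughly:
#   src_unique_dataset_id  id_dataset
# 0                uuid-a         1.0
# 1                uuid-b         NaN   (known but not authorized)
# 2              uuid-zzz         NaN   (unknown UUID)
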
9 changes: 2 additions & 7 deletions backend/geonature/core/imports/checks/sql/nomenclature.py
@@ -176,9 +176,7 @@ def check_nomenclature_exist_proof(
)


def check_nomenclature_blurring(
imprt, entity, blurring_field, id_dataset_field, uuid_dataset_field
):
def check_nomenclature_blurring(imprt, entity, blurring_field, id_dataset_field):
"""
Raise an error if blurring is not set.
Required if the dataset is private.
@@ -196,10 +194,7 @@ def check_nomenclature_blurring(
error_type=ImportCodeError.CONDITIONAL_MANDATORY_FIELD_ERROR,
error_column=blurring_field.name_field,
whereclause=sa.and_(
sa.or_(
transient_table.c[id_dataset_field.name_field] == TDatasets.id_dataset,
transient_table.c[uuid_dataset_field.name_field] == TDatasets.unique_dataset_id,
),
transient_table.c[id_dataset_field.name_field] == TDatasets.id_dataset,
TDatasets.id_nomenclature_data_origin == id_nomenclature_private,
transient_table.c[blurring_field.dest_field] == None,
),
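
A compact sketch of the simplified blurring condition above; sa.table()/sa.column() placeholders stand in for the real transient table and TDatasets model, and the private nomenclature id is a made-up constant. The transient row is now joined to the dataset on id_dataset alone, since unique_dataset_id is resolved to id_dataset before the SQL checks run.

import sqlalchemy as sa

transient_table = sa.table(
    "t_imports_synthese",
    sa.column("id_dataset"),
    sa.column("id_nomenclature_blurring"),
)
t_datasets = sa.table(
    "t_datasets",
    sa.column("id_dataset"),
    sa.column("id_nomenclature_data_origin"),
)
id_nomenclature_private = 123  # placeholder for the "private" data-origin nomenclature id

whereclause = sa.and_(
    transient_table.c.id_dataset == t_datasets.c.id_dataset,
    t_datasets.c.id_nomenclature_data_origin == id_nomenclature_private,
    transient_table.c.id_nomenclature_blurring == None,  # noqa: E711 -- renders as IS NULL
)
print(whereclause)
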
@@ -5,37 +5,37 @@
Create Date: 2024-12-17 11:18:07.806852
"""

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = 'a43842db7ac1'
down_revision = '2b0b3bd0248c'
revision = "a43842db7ac1"
down_revision = "2b0b3bd0248c"
branch_labels = None
depends_on = None


def upgrade():
op.execute(
"""
UPDATE gn_imports.bib_fields
SET display = TRUE,
mandatory = TRUE,
optional_conditions = '{"unique_dataset_id"}',
type_field = 'dataset',
type_field_params = '{"bind_value": "id_dataset"}'
WHERE name_field = 'id_dataset'
type_field_params = '{"bind_value": "unique_dataset_id"}',
source_field = 'src_unique_dataset_id',
dest_field = NULL
WHERE name_field = 'unique_dataset_id'
"""
)
op.execute(
"""
UPDATE gn_imports.bib_fields
SET display = TRUE,
mandatory = TRUE,
optional_conditions = '{"id_dataset"}',
type_field = 'dataset',
type_field_params = '{"bind_value": "unique_dataset_id"}'
WHERE name_field = 'unique_dataset_id'
SET dest_field = 'id_dataset',
source_field = NULL
WHERE name_field = 'id_dataset'
"""
)

@@ -48,18 +48,15 @@ def downgrade():
mandatory = FALSE,
optional_conditions = NULL,
type_field = 'text',
type_field_params = NULL
WHERE name_field = 'id_dataset'
type_field_params = NULL,
dest_field = 'unique_dataset_id'
WHERE name_field = 'unique_dataset_id'
"""
)
op.execute(
"""
UPDATE gn_imports.bib_fields
SET display = FALSE,
mandatory = FALSE,
optional_conditions = NULL,
type_field = 'text',
type_field_params = NULL
WHERE name_field = 'unique_dataset_id'
SET dest_field = NULL
WHERE name_field = 'id_dataset'
"""
)
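
An optional verification sketch of the intended end state of this migration (the connection string is a placeholder; adapt it to the target GeoNature database): after upgrade(), unique_dataset_id should be the displayed, mandatory "dataset" field read from src_unique_dataset_id with no dest_field, while id_dataset should keep only dest_field = 'id_dataset' and no source column.

import sqlalchemy as sa

engine = sa.create_engine("postgresql:///geonature")  # placeholder connection string
with engine.connect() as conn:
    rows = conn.execute(
        sa.text(
            "SELECT name_field, display, mandatory, type_field, type_field_params, "
            "source_field, dest_field "
            "FROM gn_imports.bib_fields "
            "WHERE name_field IN ('unique_dataset_id', 'id_dataset')"
        )
    )
    for row in rows:
        print(dict(row._mapping))
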
