Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Corruption Fixer #124

Merged
merged 8 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions alembic/versions/1def8c988372_add_librarian_transfer_toggling.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,25 @@ def upgrade():
"corrupt_files",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("file_name", sa.String(), nullable=False),
sa.Column("file_source", sa.String(), nullable=False),
sa.Column("instance_id", sa.Integer(), nullable=False),
sa.Column("instance_path", sa.String(), nullable=False),
sa.Column("corrupt_time", sa.DateTime(), nullable=False),
sa.Column("size", sa.BigInteger(), nullable=False),
sa.Column("checksum", sa.String(), nullable=False),
sa.Column("count", sa.Integer(), nullable=False),
sa.Column("replacement_requested", sa.Boolean(), nullable=False),
sa.Column("incoming_transfer_id", sa.Integer(), nullable=True),
sa.PrimaryKeyConstraint("id"),
)

with op.batch_alter_table("outgoing_transfers") as batch_op:
batch_op.alter_column("file_name", nullable=True)


def downgrade():
    """
    Revert this migration's schema changes.

    Drops the ``transfers_enabled`` column from ``librarians``, drops the
    ``corrupt_files`` table, and makes ``outgoing_transfers.file_name``
    non-nullable again.
    """
    op.drop_column("librarians", "transfers_enabled")
    op.drop_table("corrupt_files")

    # The column is altered through Alembic's batch mode rather than a
    # direct ALTER.
    # NOTE(review): restoring NOT NULL will presumably fail if any
    # outgoing_transfers rows were written with a NULL file_name while the
    # upgraded schema was live -- confirm before downgrading real data.
    with op.batch_alter_table("outgoing_transfers") as batch_op:
        batch_op.alter_column("file_name", nullable=False)
43 changes: 43 additions & 0 deletions docs/source/Background.rst
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,26 @@ The following background tasks are available:
This task is configured with the following additional parameters:

* ``age_in_days``: The number of days back to check for files to transfer (integer).
- ``duplicate_remote_instance_hypervisor``: A hypervisor that looks for duplicate remote
instance rows in the table and removes one. This ensures database integrity and that
the count of remote instances per librarian and store corresponds to the number of files
on that librarian.
- ``rolling_deletion``: A task to delete data that was ingested into the librarian
more than ``age_in_days`` ago. Note that this does not delete them from the entire network,
and has specific tools to ensure other copies exist in the network elsewhere:

* ``store_name``: The store to delete instances from
* ``age_in_days``: The minimum age, in days, that data must reach before it is considered for deletion
* ``number_of_remote_copies``: The number of copies in the rest of the network (which are
validated using checksumming) that must be kept before deleting a local instance.
* ``verifiy_downstream_checksums``: Whether to make sure all downstream checksums that were
computed on request match the underlying data before deletion (True).
* ``mark_unavailable``: Whether to mark instances as unavailable (True) or actually remove the
rows in the table (False). Default True.
* ``force_deletion``: Whether to ignore the legacy DeletionPolicy parameter (True).
- ``corruption_fixer``: A task that reaches out to upstream librarians to ask for new copies of
corrupt files in the table. These corrupt files can be found by the ``check_integrity`` task
or when upstreams validate files during the deletion process.


Background Task Configuration Examples
Expand Down Expand Up @@ -198,6 +218,22 @@ store. The destination librarian is called ``destination``.
"every": "01:00:00",
"age_in_days": 2
}
],
"duplicate_remote_instance_hypervisor": [
{
"task_name": "Duplicate RI hypervisor",
"soft_timeout": "00:30:00",
"every": "24:00:00"
}
],
"rolling_deletion": [
{
"task_name": "Storage Recovery",
"soft_timeout": "00:30:00",
"every": "24:00:00",
"store_name": "store",
"number_of_remote_copies": 2
}
]
}

Expand Down Expand Up @@ -229,5 +265,12 @@ deleted from the store.
"every": "01:00:00",
"age_in_days": 2
}
],
"corruption_fixer": [
{
"task_name": "Corruption fixer",
"soft_timeout": "00:30:00",
"every": "24:00:00"
}
]
}
24 changes: 24 additions & 0 deletions hera_librarian/models/corrupt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
Models for the corruption fixing endpoints.
"""

from pydantic import BaseModel


class CorruptionPreparationRequest(BaseModel):
    """
    Request model for the corruption-fixing endpoints.

    NOTE(review): judging by the name, this is sent ahead of a resend to
    check that the other side is ready -- confirm against the endpoint.
    """

    # Name of the file the request concerns.
    file_name: str
    # Name of a librarian.
    # NOTE(review): presumably the librarian making the request -- confirm.
    librarian_name: str


class CorruptionPreparationResponse(BaseModel):
    """
    Response model paired (by name) with ``CorruptionPreparationRequest``.
    """

    # Readiness flag returned to the caller.
    # NOTE(review): the exact conditions that make this True are decided by
    # the endpoint, which is not visible here.
    ready: bool


class CorruptionResendRequest(BaseModel):
    """
    Request model for the corruption-fixing endpoints.

    NOTE(review): judging by the name, this asks for a file to be sent
    again -- confirm against the endpoint.
    """

    # Name of a librarian.
    # NOTE(review): presumably the librarian making the request -- confirm.
    librarian_name: str
    # Name of the file the request concerns.
    file_name: str


class CorruptionResendResponse(BaseModel):
    """
    Response model paired (by name) with ``CorruptionResendRequest``.
    """

    # Whether the request succeeded.
    success: bool
    # Integer transfer identifier.
    # NOTE(review): presumably the ID of the transfer record created for the
    # resend on the destination side -- confirm against the endpoint.
    destination_transfer_id: int
1 change: 1 addition & 0 deletions librarian_background/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def background(run_once: bool = False):
+ background_settings.incoming_transfer_hypervisor
+ background_settings.duplicate_remote_instance_hypervisor
+ background_settings.rolling_deletion
+ background_settings.corruption_fixer
)

for task in all_tasks:
Expand Down
Loading
Loading