-
Notifications
You must be signed in to change notification settings - Fork 58
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add task events to the scheduler #2043
Changes from 22 commits
9b73610
24f68f3
a2041eb
7514691
91042df
014a80c
4aadee8
bd2a146
86167c6
3214048
80663cb
2f36177
36f8246
b0044cc
d1572d2
cf5fd6b
7f3015e
ef2c691
8ad6a5f
eb8b97f
97404b5
b96f51b
d489b48
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
"""Create events table | ||
|
||
Revision ID: 0008 | ||
Revises: 0007 | ||
Create Date: 2023-11-14 15:00:00.000000 | ||
|
||
""" | ||
import sqlalchemy as sa | ||
from alembic import op | ||
from sqlalchemy.dialects import postgresql | ||
|
||
import scheduler | ||
|
||
# revision identifiers, used by Alembic. | ||
revision = "0008" | ||
down_revision = "0007" | ||
branch_labels = None | ||
depends_on = None | ||
|
||
|
||
def upgrade(): | ||
# Add events table | ||
op.create_table( | ||
"events", | ||
sa.Column("id", sa.Integer(), nullable=False, autoincrement=True), | ||
sa.Column("task_id", scheduler.utils.datastore.GUID(), nullable=True), | ||
sa.Column("type", sa.String(), nullable=True), | ||
sa.Column("context", sa.String(), nullable=True), | ||
sa.Column("event", sa.String(), nullable=True), | ||
sa.Column("timestamp", sa.DateTime(timezone=True), nullable=False, server_default=sa.text("now()")), | ||
sa.Column("data", postgresql.JSONB(astext_type=sa.Text()), nullable=True), | ||
sa.PrimaryKeyConstraint("id"), | ||
) | ||
|
||
op.create_index(op.f("ix_events_task_id"), "events", ["task_id"], unique=False) | ||
|
||
|
||
def downgrade(): | ||
# Drop the events table | ||
op.drop_table("events") |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,11 @@ | ||
from .base import Base | ||
from .boefje import Boefje, BoefjeMeta | ||
from .events import RawData, RawDataReceivedEvent | ||
from .events import Event, EventDB, RawData, RawDataReceivedEvent | ||
from .health import ServiceHealth | ||
from .normalizer import Normalizer | ||
from .ooi import OOI, MutationOperationType, ScanProfile, ScanProfileMutation | ||
from .organisation import Organisation | ||
from .plugin import Plugin | ||
from .queue import PrioritizedItem, PrioritizedItemDB, Queue | ||
from .scheduler import Scheduler | ||
from .tasks import BoefjeTask, NormalizerTask, Task, TaskDB, TaskStatus | ||
from .tasks import BoefjeTask, NormalizerTask, Task, TaskDB, TaskList, TaskStatus |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,60 @@ | ||
from datetime import datetime | ||
import uuid | ||
from datetime import datetime, timezone | ||
|
||
from pydantic import BaseModel | ||
from pydantic import BaseModel, ConfigDict, Field | ||
from sqlalchemy import Column, DateTime, Integer, String | ||
from sqlalchemy.dialects.postgresql import JSONB | ||
from sqlalchemy.schema import Index | ||
from sqlalchemy.sql import func | ||
|
||
from scheduler.utils import GUID | ||
|
||
from .base import Base | ||
from .raw_data import RawData | ||
|
||
|
||
class RawDataReceivedEvent(BaseModel): | ||
created_at: datetime | ||
organization: str | ||
raw_data: RawData | ||
|
||
|
||
class Event(BaseModel): | ||
model_config = ConfigDict(from_attributes=True) | ||
|
||
task_id: uuid.UUID | ||
|
||
type: str | ||
|
||
context: str | ||
|
||
event: str | ||
|
||
timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) | ||
|
||
data: dict | ||
|
||
|
||
class EventDB(Base): | ||
__tablename__ = "events" | ||
|
||
id = Column(Integer, primary_key=True) | ||
|
||
task_id = Column(GUID) | ||
|
||
type = Column(String) | ||
|
||
context = Column(String) | ||
|
||
event = Column(String) | ||
|
||
timestamp = Column(DateTime(timezone=True), nullable=False, server_default=func.now()) | ||
|
||
data = Column(JSONB, nullable=False) | ||
|
||
__table_args__ = ( | ||
Index( | ||
"ix_events_task_id", | ||
task_id, | ||
), | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
from pydantic import BaseModel, ConfigDict, Field | ||
from sqlalchemy import Column, DateTime, Enum, String | ||
from sqlalchemy.dialects.postgresql import JSONB | ||
from sqlalchemy.ext.hybrid import hybrid_property | ||
from sqlalchemy.schema import Index | ||
from sqlalchemy.sql import func | ||
from sqlalchemy.sql.expression import text | ||
|
@@ -44,6 +45,27 @@ class TaskStatus(str, enum.Enum): | |
CANCELLED = "cancelled" | ||
|
||
|
||
class TaskList(BaseModel): | ||
model_config = ConfigDict(from_attributes=True) | ||
|
||
id: uuid.UUID | ||
|
||
scheduler_id: str | ||
|
||
type: str | ||
|
||
p_item: PrioritizedItem | ||
|
||
status: TaskStatus | ||
|
||
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) | ||
|
||
modified_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) | ||
|
||
def __repr__(self): | ||
return f"Task(id={self.id}, scheduler_id={self.scheduler_id}, type={self.type}, status={self.status})" | ||
|
||
|
||
class Task(BaseModel): | ||
model_config = ConfigDict(from_attributes=True) | ||
|
||
|
@@ -57,13 +79,22 @@ class Task(BaseModel): | |
|
||
status: TaskStatus | ||
|
||
duration: Optional[float] = Field(None, alias="duration", readonly=True) | ||
|
||
queued: Optional[float] = Field(None, alieas="queued", readonly=True) | ||
|
||
runtime: Optional[float] = Field(None, alias="runtime", readonly=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Type should be |
||
|
||
created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) | ||
|
||
modified_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) | ||
|
||
def __repr__(self): | ||
return f"Task(id={self.id}, scheduler_id={self.scheduler_id}, type={self.type}, status={self.status})" | ||
|
||
def model_dump_db(self): | ||
return self.model_dump(exclude={"duration", "queued", "runtime"}) | ||
|
||
|
||
class TaskDB(Base): | ||
__tablename__ = "tasks" | ||
|
@@ -103,6 +134,33 @@ class TaskDB(Base): | |
), | ||
) | ||
|
||
_event_store = None | ||
|
||
@classmethod | ||
def set_event_store(cls, event_store): | ||
cls._event_store = event_store | ||
|
||
@hybrid_property | ||
def duration(self) -> float: | ||
if self._event_store is None: | ||
raise ValueError("EventStore instance is not set. Use TaskDB.set_event_store to set it.") | ||
|
||
return self._event_store.get_task_duration(self.id) | ||
|
||
@hybrid_property | ||
def queued(self) -> float: | ||
if self._event_store is None: | ||
raise ValueError("EventStore instance is not set. Use TaskDB.set_event_store to set it.") | ||
|
||
return self._event_store.get_task_queued(self.id) | ||
|
||
@hybrid_property | ||
def runtime(self) -> float: | ||
if self._event_store is None: | ||
raise ValueError("EventStore instance is not set. Use TaskDB.set_event_store to set it.") | ||
|
||
return self._event_store.get_task_runtime(self.id) | ||
|
||
|
||
class NormalizerTask(BaseModel): | ||
"""NormalizerTask represent data needed for a Normalizer to run.""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -89,6 +89,8 @@ def post_push(self, p_item: models.PrioritizedItem) -> None: | |
Args: | ||
p_item: The prioritized item from the priority queue. | ||
""" | ||
# Create task | ||
# | ||
# NOTE: we set the id of the task the same as the p_item, for easier | ||
# lookup. | ||
task = models.Task( | ||
|
@@ -101,16 +103,28 @@ def post_push(self, p_item: models.PrioritizedItem) -> None: | |
modified_at=datetime.now(timezone.utc), | ||
) | ||
|
||
# Create event | ||
event = models.Event( | ||
task_id=task.id, | ||
type="events.db", | ||
context="task", | ||
event="insert", | ||
data=task.model_dump(), | ||
) | ||
|
||
task_db = self.ctx.datastores.task_store.get_task_by_id(str(p_item.id)) | ||
if task_db is not None: | ||
event.event = "update" | ||
self.ctx.datastores.task_store.update_task(task) | ||
self.ctx.datastores.event_store.create_event(event) | ||
return | ||
|
||
self.ctx.datastores.task_store.create_task(task) | ||
self.ctx.datastores.event_store.create_event(event) | ||
|
||
def post_pop(self, p_item: models.PrioritizedItem) -> None: | ||
"""When a boefje task is being removed from the queue. We | ||
persist a task to the datastore with the status RUNNING | ||
persist a task to the datastore with the status DISPATCHED. | ||
|
||
Args: | ||
p_item: The prioritized item from the priority queue. | ||
|
@@ -127,10 +141,20 @@ def post_pop(self, p_item: models.PrioritizedItem) -> None: | |
) | ||
return None | ||
|
||
# Update task | ||
task.status = models.TaskStatus.DISPATCHED | ||
self.ctx.datastores.task_store.update_task(task) | ||
|
||
return None | ||
# Create event | ||
event = models.Event( | ||
task_id=task.id, | ||
type="events.db", | ||
context="task", | ||
event="update", | ||
data=task.model_dump(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will result in a very big event table, because the task model is pretty big already and this will be duplicated unnecessarily multiple times in the events table. I don't think this is a good idea with regards to performance and resource usage. |
||
) | ||
|
||
self.ctx.datastores.event_store.create_event(event) | ||
|
||
def pop_item_from_queue( | ||
self, filters: Optional[storage.filters.FilterRequest] = None | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should be a foreign key to the task table.