diff --git a/oonidata/src/oonidata/models/analysis.py b/oonidata/src/oonidata/models/analysis.py deleted file mode 100644 index 2906bbc3..00000000 --- a/oonidata/src/oonidata/models/analysis.py +++ /dev/null @@ -1,132 +0,0 @@ -from dataclasses import dataclass -import dataclasses -from datetime import datetime -from typing import List, Optional - -from .base import table_model -from oonidata.models.observations import MeasurementMeta, ProbeMeta - - -@table_model( - table_name="obs_web_analysis", - table_index=( - "analysis_id", - "measurement_uid", - "observation_id", - "measurement_start_time", - ), -) -@dataclass -class WebAnalysis: - probe_meta: ProbeMeta - measurement_meta: MeasurementMeta - - analysis_id: str - observation_id: str - - created_at: datetime - - # This is the domain name associated with the target, for example for - # facebook it will be www.facebook.com, but also edge-mqtt.facebook.com - target_domain_name: str - # This is the more granular level associated with a target, for example the IP, port tuple - target_detail: str - - # dns_ground_truth_nxdomain_cc_asn: Optional[set] = None - # dns_ground_truth_failure_cc_asn: Optional[set] = None - # dns_ground_truth_ok_cc_asn: Optional[set] = None - # dns_ground_truth_other_ips: Optional[Dict[str, set]] = None - # dns_ground_truth_other_asns: Optional[Dict[str, set]] = None - # dns_ground_truth_trusted_answers: Optional[Dict] = None - dns_ground_truth_nxdomain_count: Optional[int] = None - dns_ground_truth_failure_count: Optional[int] = None - dns_ground_truth_ok_count: Optional[int] = None - dns_ground_truth_ok_cc_asn_count: Optional[int] = None - dns_ground_truth_failure_cc_asn_count: Optional[int] = None - dns_ground_truth_nxdomain_cc_asn_count: Optional[int] = None - dns_consistency_system_answers: List[str] = dataclasses.field(default_factory=list) - dns_consistency_system_success: Optional[bool] = None - dns_consistency_system_failure: Optional[str] = None - dns_consistency_system_answer_count: Optional[int] = None - dns_consistency_system_is_answer_tls_consistent: Optional[bool] = None - dns_consistency_system_is_answer_tls_inconsistent: Optional[bool] = None - dns_consistency_system_is_answer_ip_in_trusted_answers: Optional[bool] = None - dns_consistency_system_is_answer_asn_in_trusted_answers: Optional[bool] = None - dns_consistency_system_is_answer_asorg_in_trusted_answers: Optional[bool] = None - dns_consistency_system_is_answer_cloud_provider: Optional[bool] = None - dns_consistency_system_is_answer_probe_asn_match: Optional[bool] = None - dns_consistency_system_is_answer_probe_cc_match: Optional[bool] = None - dns_consistency_system_is_answer_bogon: Optional[bool] = None - dns_consistency_system_answer_fp_name: Optional[str] = None - dns_consistency_system_answer_fp_scope: Optional[str] = None - dns_consistency_system_is_answer_fp_match: Optional[bool] = None - dns_consistency_system_is_answer_fp_country_consistent: Optional[bool] = None - dns_consistency_system_is_answer_fp_false_positive: Optional[bool] = None - dns_consistency_system_is_resolver_probe_asn_match: Optional[bool] = None - dns_consistency_system_is_resolver_probe_cc_match: Optional[bool] = None - dns_consistency_system_answer_ip_ground_truth_asn_count: Optional[int] = None - dns_consistency_system_answer_asn_ground_truth_asn_count: Optional[int] = None - dns_consistency_other_answers: List[str] = dataclasses.field(default_factory=list) - dns_consistency_other_success: Optional[bool] = None - dns_consistency_other_failure: Optional[str] = None - 
dns_consistency_other_answer_count: Optional[int] = None - dns_consistency_other_is_answer_tls_consistent: Optional[bool] = None - dns_consistency_other_is_answer_tls_inconsistent: Optional[bool] = None - dns_consistency_other_is_answer_ip_in_trusted_answers: Optional[bool] = None - dns_consistency_other_is_answer_asn_in_trusted_answers: Optional[bool] = None - dns_consistency_other_is_answer_asorg_in_trusted_answers: Optional[bool] = None - dns_consistency_other_is_answer_cloud_provider: Optional[bool] = None - dns_consistency_other_is_answer_probe_asn_match: Optional[bool] = None - dns_consistency_other_is_answer_probe_cc_match: Optional[bool] = None - dns_consistency_other_is_answer_bogon: Optional[bool] = None - dns_consistency_other_answer_fp_name: Optional[str] = None - dns_consistency_other_answer_fp_scope: Optional[str] = None - dns_consistency_other_is_answer_fp_match: Optional[bool] = None - dns_consistency_other_is_answer_fp_country_consistent: Optional[bool] = None - dns_consistency_other_is_answer_fp_false_positive: Optional[bool] = None - dns_consistency_other_is_resolver_probe_asn_match: Optional[bool] = None - dns_consistency_other_is_resolver_probe_cc_match: Optional[bool] = None - dns_consistency_other_answer_ip_ground_truth_asn_count: Optional[int] = None - dns_consistency_other_answer_asn_ground_truth_asn_count: Optional[int] = None - tls_success: Optional[bool] = None - tls_failure: Optional[str] = None - tls_is_tls_certificate_valid: Optional[bool] = None - tls_is_tls_certificate_invalid: Optional[bool] = None - tls_handshake_read_count: Optional[int] = None - tls_handshake_write_count: Optional[int] = None - tls_handshake_read_bytes: Optional[float] = None - tls_handshake_write_bytes: Optional[float] = None - tls_handshake_time: Optional[float] = None - tls_ground_truth_failure_count: Optional[int] = None - tls_ground_truth_failure_asn_cc_count: Optional[int] = None - tls_ground_truth_ok_count: Optional[int] = None - tls_ground_truth_ok_asn_cc_count: Optional[int] = None - tls_ground_truth_trusted_failure_count: Optional[int] = None - tls_ground_truth_trusted_ok_count: Optional[int] = None - tcp_address: Optional[str] = None - tcp_success: Optional[bool] = None - tcp_failure: Optional[str] = None - tcp_ground_truth_failure_count: Optional[int] = None - tcp_ground_truth_failure_asn_cc_count: Optional[int] = None - tcp_ground_truth_ok_count: Optional[int] = None - tcp_ground_truth_ok_asn_cc_count: Optional[int] = None - tcp_ground_truth_trusted_failure_count: Optional[int] = None - tcp_ground_truth_trusted_ok_count: Optional[int] = None - http_success: Optional[bool] = None - http_failure: Optional[str] = None - http_is_http_request_encrypted: Optional[bool] = None - http_response_body_proportion: Optional[float] = None - http_response_body_length: Optional[int] = None - http_response_status_code: Optional[int] = None - http_ground_truth_failure_count: Optional[int] = None - http_ground_truth_failure_asn_cc_count: Optional[int] = None - http_ground_truth_ok_count: Optional[int] = None - http_ground_truth_ok_asn_cc_count: Optional[int] = None - http_ground_truth_trusted_ok_count: Optional[int] = None - http_ground_truth_trusted_failure_count: Optional[int] = None - http_ground_truth_body_length: Optional[int] = None - http_fp_name: Optional[str] = None - http_fp_scope: Optional[str] = None - http_is_http_fp_match: Optional[bool] = None - http_is_http_fp_country_consistent: Optional[bool] = None - http_is_http_fp_false_positive: Optional[bool] = None diff --git 
a/oonidata/src/oonidata/models/dataformats.py b/oonidata/src/oonidata/models/dataformats.py index 03408819..94c0f357 100644 --- a/oonidata/src/oonidata/models/dataformats.py +++ b/oonidata/src/oonidata/models/dataformats.py @@ -3,6 +3,7 @@ See: https://github.com/ooni/spec/tree/master/data-formats """ + from base64 import b64decode import hashlib diff --git a/oonidata/src/oonidata/models/experiment_result.py b/oonidata/src/oonidata/models/experiment_result.py deleted file mode 100644 index a2c15d2b..00000000 --- a/oonidata/src/oonidata/models/experiment_result.py +++ /dev/null @@ -1,304 +0,0 @@ -import dataclasses -from dataclasses import dataclass -import logging -from typing import Any, Dict, Generator, List, Optional, NamedTuple, Mapping, Tuple -from enum import Enum -from datetime import datetime, timezone - -from tabulate import tabulate - -from ..datautils import maybe_elipse - -from .base import table_model -from .observations import ProbeMeta, MeasurementMeta, WebObservation - -log = logging.getLogger("oonidata.events") - - -class BlockingScope(Enum): - # n: national level blocking - NATIONAL_BLOCK = "n" - # i: isp level blocking - ISP_BLOCK = "i" - # l: local blocking (school, office, home network) - LOCAL_BLOCK = "l" - # s: server-side blocking - SERVER_SIDE_BLOCK = "s" - # t: this is a signal indicating some form of network throttling - THROTTLING = "t" - # u: unknown blocking scope - UNKNOWN = "u" - - -def fp_to_scope( - scope: Optional[str], -) -> BlockingScope: - # "nat" national level blockpage - # "isp" ISP level blockpage - # "prod" text pattern related to a middlebox product - # "inst" text pattern related to a voluntary instition blockpage (school, office) - # "vbw" vague blocking word - # "fp" fingerprint for false positives - if scope == "nat": - return BlockingScope.NATIONAL_BLOCK - elif scope == "isp": - return BlockingScope.ISP_BLOCK - elif scope == "inst": - return BlockingScope.LOCAL_BLOCK - elif scope == "fp": - return BlockingScope.SERVER_SIDE_BLOCK - - return BlockingScope.UNKNOWN - - -class Scores(NamedTuple): - ok: float - down: float - blocked: float - - -class Outcome(NamedTuple): - observation_id: str - subject: str - scope: BlockingScope - category: str - detail: str - meta: Mapping[str, str] - label: str - - ok_score: float - down_score: float - blocked_score: float - - -@table_model( - table_name="measurement_experiment_result", - table_index=( - "measurement_uid", - "timeofday", - ), -) -@dataclass -class MeasurementExperimentResult: - measurement_meta: MeasurementMeta - probe_meta: ProbeMeta - - # The list of observations used to generate this experiment result - observation_id_list: List[str] - - # The timeofday for which this experiment result is relevant. 
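As a side note, a minimal self-contained sketch of how the helpers removed above were meant to fit together: fp_to_scope maps a fingerprint scope string onto a BlockingScope, and the (ok, down, blocked) triplet carried by Scores/Outcome defines a probability space summing to 1.0. The snippet mirrors the deleted definitions but stands on its own; the example values are made up.

from enum import Enum
from typing import NamedTuple, Optional


class BlockingScope(Enum):
    NATIONAL_BLOCK = "n"
    ISP_BLOCK = "i"
    LOCAL_BLOCK = "l"
    SERVER_SIDE_BLOCK = "s"
    THROTTLING = "t"
    UNKNOWN = "u"


def fp_to_scope(scope: Optional[str]) -> BlockingScope:
    # Same mapping as the deleted helper: fingerprint scopes come from the
    # fingerprint DB ("nat", "isp", "inst", "fp", ...).
    return {
        "nat": BlockingScope.NATIONAL_BLOCK,
        "isp": BlockingScope.ISP_BLOCK,
        "inst": BlockingScope.LOCAL_BLOCK,
        "fp": BlockingScope.SERVER_SIDE_BLOCK,
    }.get(scope or "", BlockingScope.UNKNOWN)


class Scores(NamedTuple):
    ok: float
    down: float
    blocked: float


# A DNS failure attributed to a national-level blockpage fingerprint:
scores = Scores(ok=0.0, down=0.2, blocked=0.8)
assert abs(sum(scores) - 1.0) < 1e-9  # the scores define a probability space
print(fp_to_scope("nat"))  # BlockingScope.NATIONAL_BLOCK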
We use the - # timeofday convention to differentiate it from the timestamp which is an - # instant, while experiment results might indicate a range - timeofday: datetime - - # When this experiment result was created - created_at: datetime - - # Location attributes are relevant to qualify the location for which an - # experiment result is relevant - # The primary key for the location is the tuple: - # (location_network_type, location_network_asn, location_network_cc, location_resolver_asn) - - # Maybe in the future we have this when we get geoip through other means - # location_region_cc: str - # location_region_name: str - location_network_type: str - - location_network_asn: int - location_network_cc: str - - location_network_as_org_name: str - location_network_as_cc: str - - # Maybe this should be dropped, as it would make the dimension potentially explode. - # location_resolver_ip: Optional[str] - location_resolver_asn: Optional[int] - location_resolver_as_org_name: Optional[str] - location_resolver_as_cc: Optional[str] - location_resolver_cc: Optional[str] - - # The blocking scope signifies at which level we believe the blocking to be - # implemented. - # We put it in the location keys, since effectively the location definition - # is relevant to map where in the network and space the blocking is - # happening and is not necessarily a descriptor of the location of the - # vantage points used to determine this. - # - # The scope can be: nat, isp, inst, fp to indicate national level blocking, - # isp level blocking, local blocking (eg. university or corporate network) - # or server-side blocking. - location_blocking_scope: Optional[str] - - # Should we include this or not? Benefit of dropping it is that it collapses - # the dimension when we do non-instant experiment results. - # platform_name: Optional[str] - - # Target nettest group is the high level experiment group taxonomy, but may - # in the future include also other more high level groupings. - target_nettest_group: str - # Target Category can be a citizenlab category code. Ex. GRP for social - # networking - target_category: str - # This is a more granular, yet high level definition of the target. Ex. - # facebook for all endpoints related to facebook - target_name: str - # This is the domain name associated with the target, for example for - # facebook it will be www.facebook.com, but also edge-mqtt.facebook.com - target_domain_name: str - # This is the more granular level associated with a target, for example the IP, port tuple - target_detail: str - - # Likelihood of network interference values which define a probability space - loni_ok_value: float - - # These are key value mappings that define how likely a certain class of - # outcome is. Effectively it's an encoding of a dictionary, but in a way - # where it's more efficient to perform operations on them. 
- # Example: {"ok": 0.1, "down": 0.2, "blocked.dns": 0.3, "blocked.tls": 0.4} - # would be encoded as: - # - # loni_ok_value: 0.1 - # loni_down_keys: ["down"] - # loni_down_values: [0.2] - # loni_blocked_keys: ["blocked.dns", "blocked.tls"] - # loni_blocked_values: [0.3, 0.4] - loni_down_keys: List[str] - loni_down_values: List[float] - - loni_blocked_keys: List[str] - loni_blocked_values: List[float] - - loni_ok_keys: List[str] - loni_ok_values: List[float] - - # Encoded as JSON - loni_list: List[Dict] - - # Inside this string we include a representation of the logic that led us - # to produce the above loni values - analysis_transcript_list: List[List[str]] - - # Number of measurements used to produce this experiment result - measurement_count: int - # Number of observations used to produce this experiment result - observation_count: int - # Number of vantage points used to produce this experiment result - vp_count: int - - # Backward compatible anomaly/confirmed flags - anomaly: Optional[bool] - confirmed: Optional[bool] - - -class ExperimentResult(NamedTuple): - __table_name__ = "experiment_result" - - measurement_uid: str - observation_id: str - report_id: str - input: Optional[str] - timestamp: datetime - created_at: datetime - - probe_asn: int - probe_cc: str - - probe_as_org_name: str - probe_as_cc: str - - network_type: str - - resolver_ip: Optional[str] - resolver_asn: Optional[int] - resolver_as_org_name: Optional[str] - resolver_as_cc: Optional[str] - resolver_cc: Optional[str] - - anomaly: bool - confirmed: bool - - ## These fields will be shared by multiple experiment results in a given - ## measurement - # Indicates the experiment group for this particular result, ex. im, - # websites, circumvention - experiment_group: str - # The domain name for the specified target - domain_name: str - # A string indicating the name of the target, ex. Signal, Facebook website - target_name: str - - ## These fields are unique to a particular experiment result - # A string indicating the subject of this experiment result, for example an - # IP:port combination. - subject: str - # In the event of blocking, indicates to what extent the blocking is - # happening: ISP, National, Local, Server Side, Throttling, Unknown - outcome_scope: str - # Specifies the category of the outcome, usually indicating the protocol for - # which we saw the block, ex. dns, tcp, tls, http, https - outcome_category: str - # Specifies, within the given class, what were the details of the outcome, ex. connection_reset, timeout, etc. - outcome_detail: str - # Additional metadata which can be used by an analyst to understand why the - # analysis engine came to a certain conclusion - outcome_meta: Mapping[str, str] - - # An additional label useful for assessing the metrics of the analysis - # engine. - # For example it can be used to include the blocking fingerprint flag. - outcome_label: str - - # These are scores which estimate the likelihood of this particular subject - # being reachable, down or blocked. 
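As a side note, a small runnable sketch of the loni ("likelihood of network interference") encoding described above: the per-outcome dictionary is flattened into parallel key/value lists so it can be stored and aggregated column-wise. The encode_loni helper and the example values are illustrative only, not part of the original module.

from typing import Dict, List, Tuple


def encode_loni(loni: Dict[str, float]) -> Tuple[List[str], List[float], List[str], List[float]]:
    """Split a loni dict into (down_keys, down_values, blocked_keys, blocked_values)."""
    down = {k: v for k, v in loni.items() if k == "down" or k.startswith("down.")}
    blocked = {k: v for k, v in loni.items() if k.startswith("blocked")}
    return list(down), list(down.values()), list(blocked), list(blocked.values())


loni = {"ok": 0.1, "down": 0.2, "blocked.dns": 0.3, "blocked.tls": 0.4}
loni_ok_value = loni["ok"]
loni_down_keys, loni_down_values, loni_blocked_keys, loni_blocked_values = encode_loni(loni)

assert loni_ok_value == 0.1
assert loni_down_keys == ["down"] and loni_down_values == [0.2]
assert loni_blocked_keys == ["blocked.dns", "blocked.tls"]
assert loni_blocked_values == [0.3, 0.4]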
- # The sum of all the scores for a given outcome will be 1.0 - ok_score: float - down_score: float - blocked_score: float - - experiment_result_id: str - - -def iter_experiment_results( - obs: WebObservation, - experiment_group: str, - anomaly: bool, - confirmed: bool, - domain_name: str, - target_name: str, - outcomes: List[Outcome], -) -> Generator[ExperimentResult, None, None]: - created_at = datetime.now(timezone.utc).replace(tzinfo=None) - for idx, outcome in enumerate(outcomes): - yield ExperimentResult( - measurement_uid=obs.measurement_meta.measurement_uid, - created_at=created_at, - report_id=obs.measurement_meta.report_id, - input=obs.measurement_meta.input, - timestamp=obs.measurement_meta.measurement_start_time, - probe_asn=obs.probe_meta.probe_asn, - probe_cc=obs.probe_meta.probe_cc, - probe_as_org_name=obs.probe_meta.probe_as_org_name, - probe_as_cc=obs.probe_meta.probe_as_cc, - network_type=obs.probe_meta.network_type, - resolver_ip=obs.probe_meta.resolver_ip, - resolver_asn=obs.probe_meta.resolver_asn, - resolver_as_org_name=obs.probe_meta.resolver_as_org_name, - resolver_as_cc=obs.probe_meta.resolver_as_cc, - resolver_cc=obs.probe_meta.resolver_cc, - experiment_result_id=f"{obs.measurement_meta.measurement_uid}_{idx}", - experiment_group=experiment_group, - anomaly=anomaly, - confirmed=confirmed, - domain_name=domain_name, - target_name=target_name, - observation_id=outcome.observation_id, - subject=outcome.subject, - outcome_scope=outcome.scope.value, - outcome_category=outcome.category, - outcome_detail=outcome.detail, - outcome_meta=outcome.meta, - outcome_label=outcome.label, - ok_score=outcome.ok_score, - down_score=outcome.down_score, - blocked_score=outcome.blocked_score, - ) diff --git a/oonipipeline/src/oonipipeline/__about__.py b/oonipipeline/src/oonipipeline/__about__.py index f46a802b..ae32ac1a 100644 --- a/oonipipeline/src/oonipipeline/__about__.py +++ b/oonipipeline/src/oonipipeline/__about__.py @@ -1 +1 @@ -VERSION = "5.0.0rc0" +VERSION = "5.0.0rc1" diff --git a/oonipipeline/src/oonipipeline/analysis/control.py b/oonipipeline/src/oonipipeline/analysis/control.py deleted file mode 100644 index 63f39a9b..00000000 --- a/oonipipeline/src/oonipipeline/analysis/control.py +++ /dev/null @@ -1,418 +0,0 @@ -from datetime import date, timedelta, datetime -import logging -import sqlite3 -from collections.abc import Iterable - -from typing import Any, Generator, Optional, Tuple, List, NamedTuple - -from oonidata.models.observations import WebControlObservation, WebObservation - -from ..netinfo import NetinfoDB - -from ..db.connections import ClickhouseConnection - -log = logging.getLogger(__name__) - - -class WebGroundTruth(NamedTuple): - vp_asn: int - vp_cc: str - is_trusted_vp: bool - - hostname: str - ip: Optional[str] - port: Optional[int] - - dns_failure: Optional[str] - dns_success: Optional[bool] - - tcp_failure: Optional[str] - tcp_success: Optional[bool] - - tls_failure: Optional[str] - tls_success: Optional[bool] - tls_is_certificate_valid: Optional[bool] - - http_request_url: Optional[str] - http_failure: Optional[str] - http_success: Optional[bool] - http_response_body_length: Optional[int] - - timestamp: datetime - count: int - - ip_asn: Optional[int] - ip_as_org_name: Optional[str] - - -def iter_ground_truths_from_web_control( - web_control_observations: List[WebControlObservation], - netinfodb: NetinfoDB, - count: int = 1, -) -> Generator[Tuple[Tuple[str, ...], List], None, None]: - # TODO: pass a netinfodb to lookup the ip_asn and ip_as_org_name - for 
obs in web_control_observations: - ip_as_org_name = "" - ip_asn = 0 - if obs.ip: - ip_info = netinfodb.lookup_ip( - obs.measurement_meta.measurement_start_time, obs.ip - ) - ip_asn = ip_info.as_info.asn - ip_as_org_name = ip_info.as_info.as_org_name - - wgt = WebGroundTruth( - vp_asn=0, - vp_cc="ZZ", - timestamp=obs.measurement_meta.measurement_start_time, - is_trusted_vp=True, - hostname=obs.hostname, - ip=obs.ip, - ip_asn=ip_asn, - ip_as_org_name=ip_as_org_name, - port=obs.port, - dns_failure=obs.dns_failure, - dns_success=obs.dns_success, - tcp_failure=obs.tcp_failure, - tcp_success=obs.tcp_success, - tls_failure=obs.tls_failure, - tls_success=obs.tls_success, - tls_is_certificate_valid=obs.tls_failure is None - and obs.tls_success is True, - http_request_url=obs.http_request_url, - http_failure=obs.http_failure, - http_success=obs.http_success, - http_response_body_length=obs.http_response_body_length, - count=count, - ) - yield WebGroundTruth._fields, list(wgt) - - -def iter_web_ground_truths( - db: ClickhouseConnection, netinfodb: NetinfoDB, measurement_day: date -) -> Generator[Tuple[List[str], List], None, None]: - start_day = measurement_day.strftime("%Y-%m-%d") - end_day = (measurement_day + timedelta(days=1)).strftime("%Y-%m-%d") - column_names = [ - "timestamp", - "hostname", - "ip", - "port", - "dns_failure", - "dns_success", - "tcp_failure", - "tcp_success", - "tls_failure", - "tls_success", - "tls_is_certificate_valid", - "http_request_url", - "http_failure", - "http_success", - ] - q = """ - SELECT ( - toStartOfDay(measurement_start_time) as timestamp, - hostname, - ip, - port, - dns_failure, - dns_success, - tcp_failure, - tcp_success, - tls_failure, - tls_success, - (tls_failure is NULL AND tls_success = 1) AS tls_is_certificate_valid, - http_request_url, - http_failure, - http_success, - arrayMax(topK(1)(http_response_body_length)) as http_response_body_length, - COUNT() - ) - FROM obs_web_ctrl - WHERE measurement_start_time > %(start_day)s AND measurement_start_time < %(end_day)s - """ - q += "GROUP BY " - q += ",".join(column_names) - - for res in db.execute_iter(q, dict(start_day=start_day, end_day=end_day)): - row = res[0] - - c_names = column_names + [ - "http_response_body_length", - "count", - "ip_asn", - "ip_as_org_name", - "vp_asn", - "vp_cc", - "is_trusted_vp", - ] - row_extra: List[Any] = [None, None] - # TODO move this directly into the obs_web_ctrl table - if row[2]: - ip_info = netinfodb.lookup_ip(row[0], row[2]) - row_extra = [ip_info.as_info.asn, ip_info.as_info.as_org_name] - - # vp_asn, vp_cc, is_trusted_vp - row_extra.append(0) - row_extra.append("ZZ") - row_extra.append(1) - - yield c_names, row + tuple(row_extra) - - -class WebGroundTruthDB: - """ - The Web Ground Truth database is used by the websites experiment results - processor for looking up ground truths related to a particular set of - measurements. - - Currently it's implemented through an in-memory SQLite databases which - contains all the ground_truths for a particular day. 
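As a side note, a minimal sketch of the same pattern using a simplified, hypothetical schema: ground-truth rows are loaded into an in-memory SQLite table, indexed, and then queried while excluding the probe's own vantage point. The full (now removed) implementation follows in the hunk below.

import sqlite3

db = sqlite3.connect(":memory:")
db.execute(
    "CREATE TABLE ground_truth ("
    "vp_asn INT, vp_cc TEXT, hostname TEXT, ip TEXT, port INT, "
    "dns_failure TEXT, dns_success INT, count INT)"
)
db.execute("CREATE INDEX hostname_idx ON ground_truth(hostname, vp_asn, vp_cc)")

rows = [
    # (vp_asn, vp_cc, hostname, ip, port, dns_failure, dns_success, count)
    (0, "ZZ", "example.com", "93.184.216.34", 443, None, 1, 10),  # trusted control VP
    (30722, "IT", "example.com", "93.184.216.34", 443, None, 1, 2),
    (30722, "IT", "example.com", None, None, "dns_nxdomain_error", 0, 1),
]
db.executemany("INSERT INTO ground_truth VALUES (?, ?, ?, ?, ?, ?, ?, ?)", rows)

# Look up DNS ground truths for a hostname while excluding the probe's own
# cc/asn, mirroring the "WHERE vp_asn != ? AND vp_cc != ?" clause used below.
q = (
    "SELECT hostname, ip, dns_failure, SUM(count) FROM ground_truth "
    "WHERE vp_asn != ? AND vp_cc != ? AND hostname = ? "
    "GROUP BY hostname, ip, dns_failure"
)
for row in db.execute(q, (30722, "IT", "example.com")):
    print(row)  # -> ('example.com', '93.184.216.34', None, 10)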
- """ - - _indexes = ( - ("hostname_idx", "hostname, vp_asn, vp_cc"), - ("ip_port_idx", "ip, port, vp_asn, vp_cc"), - ("http_request_url_idx", "http_request_url, vp_asn, vp_cc"), - ) - column_names = WebGroundTruth._fields - - def __init__(self, connect_str: str = ":memory:"): - self._table_name = "ground_truth" - self.db = sqlite3.connect(connect_str) - self.db.execute("pragma synchronous = normal;") - self.db.execute("pragma journal_mode = WAL;") - self.db.execute("pragma temp_store = memory;") - - def build_from_rows(self, rows: Iterable): - self.db.execute(self.create_query) - self.db.commit() - - for column_names, row in rows: - v_str = ",".join(["?" for _ in range(len(column_names))]) - q_insert_with_values = ( - f"{self.insert_query(column_names=column_names)} VALUES ({v_str})" - ) - self.db.execute(q_insert_with_values, row) - self.db.commit() - self.db.execute("pragma vacuum;") - self.db.execute("pragma optimize;") - self.create_indexes() - - def count_rows(self) -> int: - row = self.db.execute(f"SELECT COUNT() FROM {self._table_name};").fetchone() - assert len(row) == 1 - return row[0] - - def build_from_existing(self, db_str: str): - with sqlite3.connect(db_str) as src_db: - self.db = sqlite3.connect(":memory:") - src_db.backup(self.db) - self.db.commit() - - def close(self): - self.db.close() - - def create_indexes(self): - for idx_name, idx_value in self._indexes: - self.db.execute( - f"CREATE INDEX {self._table_name}_{idx_name} ON {self._table_name}({idx_value})" - ) - self.db.commit() - - @property - def create_query(self): - return f""" - CREATE TABLE {self._table_name} ( - vp_asn INT, - vp_cc TEXT, - is_trusted_vp INT, - - timestamp TEXT, - - hostname TEXT, - ip TEXT, - ip_asn INT, - ip_as_org_name TEXT, - port INT, - - dns_failure TEXT, - dns_success INT, - - tcp_failure TEXT, - tcp_success INT, - - tls_failure TEXT, - tls_success INT, - tls_is_certificate_valid INT, - - http_request_url TEXT, - http_failure TEXT, - http_success INT, - http_response_body_length INT, - count INT - ) - """ - - def insert_query(self, column_names: List[str]): - c_str = ",".join(column_names) - q_str = f"INSERT INTO {self._table_name} ({c_str})\n" - return q_str - - def select_query( - self, - table_name: str, - probe_cc: str, - probe_asn: int, - hostnames: Optional[List[str]] = None, - ip_ports: Optional[List[Tuple[str, Optional[int]]]] = None, - http_request_urls: Optional[List[str]] = None, - ) -> Tuple[str, List]: - assert ( - hostnames or ip_ports or http_request_urls - ), "one of either hostnames or ip_ports or http_request_urls should be set" - c_str = ",\n".join( - map( - lambda r: r if r != "count" else "SUM(count) as count", - self.column_names, - ) - ) - q = f""" - SELECT - {c_str} - FROM {table_name} - WHERE vp_asn != ? AND vp_cc != ? AND ( - """ - # We want to exclude all the ground truths that are from the same - # vantage point as the probe - q_args = [probe_cc, probe_asn] - - sub_query_parts = [] - if hostnames: - sub_q = "(" - sub_q += "OR ".join( - # When hostname was supplied, we only care about it in relation - # to DNS resolutions, so we only get DNS failure or DNS success - # rows - [ - " hostname = ? AND (dns_success = 1 OR dns_failure IS NOT NULL) " - for _ in range(len(hostnames)) - ] - ) - sub_q += ")" - q_args += hostnames - sub_query_parts.append(sub_q) - - if ip_ports: - sub_q = "(" - ip_port_l = [] - for ip, port in ip_ports: - assert ip is not None, "empty IP in query" - ip_port_q = "(ip = ?" 
- q_args.append(ip) - if port is not None: - ip_port_q += " AND port = ?" - q_args.append(port) - ip_port_q += ")" - ip_port_l.append(ip_port_q) - sub_q += "OR ".join(ip_port_l) - sub_q += ")" - sub_query_parts.append(sub_q) - - if http_request_urls: - sub_q = "(" - sub_q += "OR ".join( - [" http_request_url = ?" for _ in range(len(http_request_urls))] - ) - sub_q += ")" - q_args += http_request_urls - sub_query_parts.append(sub_q) - - q += "OR ".join(sub_query_parts) - q += ")" - q += "GROUP BY " - aggregate_columns = list(self.column_names) - aggregate_columns.remove("count") - q += ", ".join(aggregate_columns) - return q, q_args - - def iter_select( - self, - probe_cc: str, - probe_asn: int, - hostnames: Optional[List[str]] = None, - ip_ports: Optional[List[Tuple[str, Optional[int]]]] = None, - http_request_urls: Optional[List[str]] = None, - ) -> Generator[Tuple[Tuple[str, ...], Any], None, None]: - q, q_args = self.select_query( - table_name=self._table_name, - probe_cc=probe_cc, - probe_asn=probe_asn, - hostnames=hostnames, - ip_ports=ip_ports, - http_request_urls=http_request_urls, - ) - for row in self.db.execute(q, q_args): - yield self.column_names, row - - def lookup( - self, - probe_cc: str, - probe_asn: int, - hostnames: Optional[List[str]] = None, - ip_ports: Optional[List[Tuple[str, Optional[int]]]] = None, - http_request_urls: Optional[List[str]] = None, - ) -> List[WebGroundTruth]: - iter_rows = self.iter_select( - probe_cc=probe_cc, - probe_asn=probe_asn, - hostnames=hostnames, - ip_ports=ip_ports, - http_request_urls=http_request_urls, - ) - matches = [] - for column_names, row in iter_rows: - gt = WebGroundTruth(**dict(zip(column_names, row))) - matches.append(gt) - return matches - - def lookup_by_web_obs(self, web_obs: List[WebObservation]) -> List[WebGroundTruth]: - """ - Returns the list of WebGroundTruth that are relevant to a particular set - of related web observations. - - Every web_obs in the list needs to be related to the same probe_cc, - probe_asn pair. 
- """ - to_lookup_hostnames = set() - to_lookup_ip_ports = set() - to_lookup_http_request_urls = set() - probe_cc = web_obs[0].probe_meta.probe_cc - probe_asn = web_obs[0].probe_meta.probe_asn - for web_o in web_obs: - # All the observations in this group should be coming from the - # same probe - assert web_o.probe_meta.probe_cc == probe_cc - assert web_o.probe_meta.probe_asn == probe_asn - if web_o.hostname is not None: - to_lookup_hostnames.add(web_o.hostname) - if web_o.ip is not None: - to_lookup_ip_ports.add((web_o.ip, web_o.port)) - if web_o.http_request_url is not None: - to_lookup_http_request_urls.add(web_o.http_request_url) - - return self.lookup( - probe_cc=probe_cc, - probe_asn=probe_asn, - ip_ports=list(to_lookup_ip_ports), - http_request_urls=list(to_lookup_http_request_urls), - hostnames=list(to_lookup_hostnames), - ) - - -class BodyDB: - def __init__(self, db: ClickhouseConnection): - self.db = db - - def lookup(self, body_sha1: str) -> List[str]: - return [] diff --git a/oonipipeline/src/oonipipeline/analysis/datasources.py b/oonipipeline/src/oonipipeline/analysis/datasources.py deleted file mode 100644 index e052838d..00000000 --- a/oonipipeline/src/oonipipeline/analysis/datasources.py +++ /dev/null @@ -1,96 +0,0 @@ -import dataclasses -from datetime import date, timedelta -from typing import Generator, List, Optional - -from oonidata.models.observations import MeasurementMeta, ProbeMeta, WebObservation - -from ..db.connections import ClickhouseConnection - - -def iter_web_observations( - db: ClickhouseConnection, - measurement_day: date, - test_name: str, - probe_cc: Optional[List[str]] = None, -) -> Generator[List[WebObservation], None, None]: - """ - Generator which returns on each iteration a list of WebObservations that - share the same measurement_uid given the specified search criteria - (measurement_day, test_name and probe_cc). - """ - q_kwargs = dict( - start_day=measurement_day.strftime("%Y-%m-%d"), - end_day=(measurement_day + timedelta(days=1)).strftime("%Y-%m-%d"), - test_name=test_name, - ) - - measurement_meta_cols = [f.name for f in dataclasses.fields(MeasurementMeta)] - probe_meta_cols = [f.name for f in dataclasses.fields(ProbeMeta)] - obs_cols = [f.name for f in dataclasses.fields(WebObservation)] - obs_cols.remove("probe_meta") - obs_cols.remove("measurement_meta") - column_names = measurement_meta_cols + probe_meta_cols + obs_cols - - q = "SELECT (" - q += ",\n".join(column_names) - q += ") FROM obs_web\n" - q += "WHERE measurement_start_time > %(start_day)s AND measurement_start_time < %(end_day)s AND test_name = %(test_name)s\n" - if probe_cc and len(probe_cc) > 0: - q += "AND probe_cc IN (" - probe_cc_args = [] - for idx, cc in enumerate(probe_cc): - q_kwargs[f"probe_cc_{idx}"] = cc - probe_cc_args.append(f"%(probe_cc_{idx})s") - q += ",".join(probe_cc_args) - q += ")" - q += "ORDER BY measurement_uid" - - obs_group = [] - last_msmt_uid = None - msmt_uid_idx = column_names.index("measurement_uid") - for res in db.execute_iter(q, q_kwargs): - row = res[0] - if not last_msmt_uid: - last_msmt_uid = row[msmt_uid_idx] - if row[msmt_uid_idx] != last_msmt_uid: - yield obs_group - last_msmt_uid = row[msmt_uid_idx] - obs_group = [] - - # TODO(art): this is super sketchy. - # We need to do this in order to obtain the correct offsets into the queried columns - # Basically probe_meta, measurement_meta are class - # attributes that are composed into the dataclass, however in the - # database they need to be stored flat, since nesting is not desirable. 
- # What we are doing here is figuring out how to construct the nested - # class in order by manually recomputing the offsets of the returned - # queries. - # If we had an ORM this might be all avoided and even without it there - # is probably a better pattern. - # See: https://github.com/ooni/data/issues/77 - measurement_meta = dict( - zip(measurement_meta_cols, row[: len(measurement_meta_cols)]) - ) - probe_meta = dict( - zip( - probe_meta_cols, - row[ - len(measurement_meta_cols) : len(measurement_meta_cols) - + len(probe_meta_cols) - ], - ) - ) - - rest = dict( - zip(obs_cols, row[len(measurement_meta_cols) + len(probe_meta_cols) :]) - ) - obs_group.append( - WebObservation( - measurement_meta=MeasurementMeta(**measurement_meta), - probe_meta=ProbeMeta(**probe_meta), - **rest, - ) - ) - - if len(obs_group) > 0: - yield obs_group diff --git a/oonipipeline/src/oonipipeline/analysis/signal.py b/oonipipeline/src/oonipipeline/analysis/signal.py deleted file mode 100644 index 97f1639f..00000000 --- a/oonipipeline/src/oonipipeline/analysis/signal.py +++ /dev/null @@ -1,264 +0,0 @@ -from typing import List, Generator - -from oonidata.models.experiment_result import ( - BlockingScope, - ExperimentResult, - Outcome, - fp_to_scope, - iter_experiment_results, -) -from oonidata.models.observations import WebObservation - -from ..fingerprintdb import FingerprintDB - - -## TODO(art): port this over to the new MeasurementExperimentResult model - - -def make_signal_experiment_result( - web_observations: List[WebObservation], - fingerprintdb: FingerprintDB, -) -> Generator[ExperimentResult, None, None]: - confirmed = False - anomaly = False - experiment_group = "im" - target_name = "signal" - outcome_label = "" - - outcomes = [] - # This DNS query is used by signal to figure out if some of it's - # services are down. - # see: https://github.com/signalapp/Signal-Android/blob/c4bc2162f23e0fd6bc25941af8fb7454d91a4a35/app/src/main/java/org/thoughtcrime/securesms/jobs/ServiceOutageDetectionJob.java#L25 - # TODO: should we do something in the case in which we can't tell - # because DNS blocking is going on (ex. in Iran)? 
- signal_is_down = ( - len( - list( - filter( - lambda o: ( - o.hostname == "uptime.signal.org" - and o.dns_answer == "127.0.0.2" - ), - web_observations, - ) - ) - ) - > 0 - ) - - for web_o in web_observations: - dns_blocked = False - tcp_blocked = False - - if web_o.hostname == "uptime.signal.org": - # we don't care about the signal uptime query results - continue - - if web_o.dns_failure: - anomaly = True - outcome_meta = {} - outcome_meta["why"] = "dns failure" - outcomes.append( - Outcome( - observation_id=f"{web_o.measurement_meta.measurement_uid}_{web_o.observation_idx}", - scope=BlockingScope.UNKNOWN, - subject=f"{web_o.hostname}", - category="dns", - label="", - detail=f"{web_o.dns_failure}", - meta={}, - blocked_score=0.8, - down_score=0.2, - ok_score=0.0, - ) - ) - continue - - if web_o.dns_answer and not web_o.tls_is_certificate_valid: - # We don't set the anomaly flag, because this logic is very - # susceptible to false positives - # anomaly = True - outcome_meta = {} - outcome_meta["why"] = "tls is inconsistent" - outcome_meta["ip"] = web_o.dns_answer - - blocked_score = 0.6 - down_score = 0.0 - blocking_scope = BlockingScope.UNKNOWN - fp = fingerprintdb.match_dns(web_o.dns_answer) - if fp: - blocking_scope = fp_to_scope(fp.scope) - if blocking_scope != BlockingScope.SERVER_SIDE_BLOCK: - dns_blocked = True - confirmed = True - anomaly = True - outcome_label = "blocked" - outcome_meta["fingerprint"] = fp.name - # TODO: add country consistency checks - - # Having a TLS inconsistency is a much stronger indication than not - # knowing. - if web_o.tls_is_certificate_valid == False: - # In these case we ignore TCP failures, since it's very likely - # to be DNS based. - dns_blocked = True - anomaly = True - blocked_score = 0.8 - - # TODO: Is this reasonable? 
- if signal_is_down == True and confirmed == False: - down_score = 0.8 - blocked_score = 0.0 - - outcomes.append( - Outcome( - observation_id=f"{web_o.measurement_meta.measurement_uid}_{web_o.observation_idx}", - scope=BlockingScope.UNKNOWN, - subject=f"{web_o.hostname}", - category="dns", - label=outcome_label, - detail=f"{web_o.dns_failure}", - meta={}, - blocked_score=blocked_score, - down_score=down_score, - ok_score=1 - (blocked_score + down_score), - ) - ) - else: - outcomes.append( - Outcome( - observation_id=f"{web_o.measurement_meta.measurement_uid}_{web_o.observation_idx}", - scope=BlockingScope.UNKNOWN, - subject=f"{web_o.hostname}", - category="dns", - label=outcome_label, - detail=f"{web_o.dns_failure}", - meta={"why": "TLS consistent answer"}, - blocked_score=0.2, - down_score=0.0, - ok_score=0.8, - ) - ) - - if not dns_blocked and web_o.tcp_failure: - down_score = 0.0 - blocked_score = 0.7 - anomaly = True - tcp_blocked = True - if signal_is_down == True: - down_score = 0.9 - blocked_score = 0.0 - anomaly = False - - outcomes.append( - Outcome( - observation_id=f"{web_o.measurement_meta.measurement_uid}_{web_o.observation_idx}", - scope=BlockingScope.UNKNOWN, - subject=f"{web_o.ip}:{web_o.port}", - category="tcp", - label="", - detail=f"{web_o.tcp_failure}", - meta={"why": "tcp failure"}, - blocked_score=blocked_score, - down_score=down_score, - ok_score=1 - (blocked_score + down_score), - ) - ) - - elif not dns_blocked and web_o.tcp_success: - outcomes.append( - Outcome( - observation_id=f"{web_o.measurement_meta.measurement_uid}_{web_o.observation_idx}", - scope=BlockingScope.UNKNOWN, - subject=f"{web_o.ip}:{web_o.port}", - category="tcp", - label=outcome_label, - detail=f"ok", - meta={}, - blocked_score=0.0, - down_score=0.0, - ok_score=1.0, - ) - ) - - if ( - not dns_blocked - and not tcp_blocked - and web_o.tls_failure - and not web_o.tls_failure.startswith("ssl_") - ): - down_score = 0.3 - blocked_score = 0.7 - anomaly = True - if signal_is_down == True: - down_score = 0.9 - blocked_score = 0.1 - anomaly = False - - outcomes.append( - Outcome( - observation_id=f"{web_o.measurement_meta.measurement_uid}_{web_o.observation_idx}", - scope=BlockingScope.UNKNOWN, - subject=f"{web_o.hostname}", - category="tls", - label="", - detail=f"{web_o.tls_failure}", - meta={}, - blocked_score=blocked_score, - down_score=down_score, - ok_score=1 - (blocked_score + down_score), - ) - ) - - # TODO: to do this properly we need to rule out cases in which the - # certificate is invalid due to bad DNS vs it being invalid due to TLS - # MITM. Doing so requires a ground truth which we should eventually add. 
- elif web_o.tls_is_certificate_valid == False: - # TODO: maybe refactor this with the above switch case - down_score = 0.1 - blocked_score = 0.9 - anomaly = True - if signal_is_down == True: - down_score = 0.9 - blocked_score = 0.1 - anomaly = False - - outcomes.append( - Outcome( - observation_id=f"{web_o.measurement_meta.measurement_uid}_{web_o.observation_idx}", - scope=BlockingScope.UNKNOWN, - subject=f"{web_o.hostname}", - category="tls", - label="", - detail=f"ssl_invalid_certificate", - meta={}, - blocked_score=blocked_score, - down_score=down_score, - ok_score=1 - (blocked_score + down_score), - ) - ) - elif not dns_blocked and not tcp_blocked and web_o.tls_cipher_suite is not None: - outcomes.append( - Outcome( - observation_id=f"{web_o.measurement_meta.measurement_uid}_{web_o.observation_idx}", - scope=BlockingScope.UNKNOWN, - subject=f"{web_o.hostname}", - category="tls", - label="", - detail="ok", - meta={}, - blocked_score=0.0, - down_score=0.0, - ok_score=1.0, - ) - ) - - return iter_experiment_results( - obs=web_observations[0], - experiment_group=experiment_group, - domain_name=target_name, - target_name=target_name, - anomaly=anomaly, - confirmed=confirmed, - outcomes=outcomes, - ) diff --git a/oonipipeline/src/oonipipeline/analysis/web_analysis.py b/oonipipeline/src/oonipipeline/analysis/web_analysis.py index 0d85f81d..35317c02 100644 --- a/oonipipeline/src/oonipipeline/analysis/web_analysis.py +++ b/oonipipeline/src/oonipipeline/analysis/web_analysis.py @@ -1,979 +1,564 @@ import logging -import ipaddress -import dataclasses - -from collections import defaultdict -from dataclasses import dataclass -from datetime import datetime, timezone -from typing import ( - Generator, - Iterable, - Optional, - List, - Dict, -) -from oonidata.models.analysis import WebAnalysis -from oonidata.models.observations import WebControlObservation, WebObservation -from ..db.connections import ClickhouseConnection -from ..analysis.control import ( - WebGroundTruth, - BodyDB, -) -from ..fingerprintdb import FingerprintDB +from datetime import datetime +from typing import Any, Dict, List, Optional +from ..db.connections import ClickhouseConnection log = logging.getLogger(__name__) -SYSTEM_RESOLVERS = ["system", "getaddrinfo", "golang_net_resolver", "go", "unknown"] CLOUD_PROVIDERS_ASNS = [ 13335, # Cloudflare: https://www.peeringdb.com/net/4224 + 209242, # Cloudflare London, LLC 20940, # Akamai: https://www.peeringdb.com/net/2 9002, # Akamai RETN + 16625, # Akamai Technologies, Inc. + 63949, # Akamai Technologies, Inc. + 16509, # Amazon.com, Inc. + 14618, # Amazon.com, Inc. 
+ 15169, # Google LLC 396982, # Google Cloud: https://www.peeringdb.com/net/30878 + 54113, # Fastly, Inc + 8075, # Microsoft Corporation + 8068, # Microsoft Corporation ] -CLOUD_PROVIDERS_AS_ORGS_SUBSTRINGS = ["akamai"] - - -def get_web_ctrl_observations( - db: ClickhouseConnection, measurement_uid: str -) -> List[WebControlObservation]: - obs_list = [] - column_names = [f.name for f in dataclasses.fields(WebControlObservation)] - q = "SELECT (" - q += ",\n".join(column_names) - q += ") FROM obs_web_ctrl WHERE measurement_uid = %(measurement_uid)s" - - for res in db.execute_iter(q, {"measurement_uid": measurement_uid}): - row = res[0] - # TODO(art): IMPORTANT, fix this to make use of the nested column names - web_control_obs = WebControlObservation( - **{k: row[idx] for idx, k in enumerate(column_names)} - ) - obs_list.append(web_control_obs) - return obs_list - - -def is_cloud_provider(asn: Optional[int], as_org_name: Optional[str]): - if asn and asn in CLOUD_PROVIDERS_ASNS: - return True - if as_org_name and any( - [ss in as_org_name.lower() for ss in CLOUD_PROVIDERS_AS_ORGS_SUBSTRINGS] - ): - return True - return False - -def encode_address(ip: str, port: Optional[int]) -> str: - """ - return a properly encoded address handling IPv6 IPs +def format_query_analysis_web_fuzzy_logic( + start_time: datetime, + end_time: datetime, + probe_cc: List[str], + # We are only doing web_connectivity for the moment + test_name: List[str] = ["web_connectivity"], + measurement_uid: Optional[str] = None, +): + q_params: Dict[str, Any] = { + "start_time": start_time, + "end_time": end_time, + "cloud_provider_asns": CLOUD_PROVIDERS_ASNS, + } + and_where = [ + "measurement_start_time > %(start_time)s", + "measurement_start_time <= %(end_time)s", + ] + if len(probe_cc) > 0: + and_where.append("probe_cc IN %(probe_cc)s") + q_params["probe_cc"] = probe_cc + if len(test_name) > 0: + and_where.append("test_name IN %(test_name)s") + q_params["test_name"] = test_name + if measurement_uid is not None: + and_where.append("measurement_uid = %(measurement_uid)s") + q_params["measurement_uid"] = measurement_uid + + where_clause = " AND ".join(and_where) + + SQL = f""" + WITH + hasAny(union_tls_consistent_ips, dns_answers) as dns_tls_consistent, + hasAny(mapKeys(ctrl_tls_inconsistent_ips), dns_answers) as dns_tls_inconsistent, + hasAny(mapKeys(ctrl_dns_answers), dns_answers) as dns_answer_matches_ctrl, + hasAny(mapKeys(ctrl_dns_answers_asns), dns_answers_asns) as dns_answer_asn_matches_ctrl, + IF(dns_answers_contain_bogon IS NULL, 0, dns_answers_contain_bogon) as dns_answers_contain_bogon, + + cloud_provider_ips_count, + not_cloud_provider_ips_count, + + --union_tls_consistent_ips, + --ctrl_dns_answers, + + ctrl_dns_failure_count, + ctrl_dns_success_count, + ctrl_dns_success_count/(ctrl_dns_failure_count+ctrl_dns_success_count) as ctrl_dns_success_rate, + + --ctrl_tls_success_ips, + ctrl_tls_success_ips[ip] as ctrl_tls_success_count, + arraySum(mapValues(ctrl_tls_success_ips)) as ctrl_tls_success_sum, + length(mapValues(ctrl_tls_success_ips)) as ctrl_tls_success_ip_count, + + --ctrl_tls_inconsistent_ips, + + ctrl_tls_inconsistent_ips[ip] as ctrl_tls_inconsistent_count, + arraySum(mapValues(ctrl_tls_inconsistent_ips)) as ctrl_tls_inconsistent_sum, + + --ctrl_tls_failing_ips, + ctrl_tls_failing_ips[ip] as ctrl_tls_failing_count, + arraySum(mapValues(ctrl_tls_failing_ips)) as ctrl_tls_failing_sum, + length(mapValues(ctrl_tls_failing_ips)) as ctrl_tls_failing_ip_count, + + --ctrl_tls_success_rates, + 
ctrl_tls_success_rates[ip] as ctrl_tls_success_rate, + + --ctrl_tcp_success_ips, + ctrl_tcp_success_ips[ip] as ctrl_tcp_success_count, + arraySum(mapValues(ctrl_tcp_success_ips)) as ctrl_tcp_success_sum, + length(mapValues(ctrl_tcp_success_ips)) as ctrl_tcp_success_ip_count, + + --ctrl_tcp_failing_ips, + ctrl_tcp_failing_ips[ip] as ctrl_tcp_failing_count, + arraySum(mapValues(ctrl_tcp_failing_ips)) as ctrl_tcp_failing_sum, + length(mapValues(ctrl_tcp_failing_ips)) as ctrl_tcp_failing_ip_count, + + --ctrl_tcp_success_rates, + ctrl_tcp_success_rates[ip] as ctrl_tcp_success_rate, + + expected_countries, + dns_blocking_scope, + has(expected_countries, probe_cc) as dns_blocking_country_consistent, + + -- Possibility distributions of states (blocking, down, ok) is 0, 0, 0 + -- (i.e. we don't know anything) + multiIf( + -- We are dealing with a row that doesn't have any DNS data associated to it, + -- most likely a HTTP(s) only observation row. + -- We set the mask to False so that this can be excluded from any aggregate + -- analysis. + length(dns_answers) = 0 AND dns_failure IS NULL, + tuple(0.0, 0.0, 0.0), + + -- We matches a country blockpage, our possibility of blocking is 1. + dns_blocking_country_consistent, + tuple(1.0, 1.0, 0.0), + + -- We got a TLS consistent inside of DNS, this is a very strong signal that + -- the answer is good. + dns_tls_consistent > 0, + tuple(0.0, 0.0, 1.0), + + -- We got a bogon that we didn't see inside of the control. This is quite likely a + -- sign of blocking. + dns_answers_contain_bogon > 0 AND dns_answer_matches_ctrl = 0, + tuple(0.95, 0.05, 0.0), + + -- We got a bogon, but it's also inside the control. This is a DNS misconfiguration + -- so we mark it as down being more possible than blocked. + dns_answers_contain_bogon > 0 AND dns_answer_matches_ctrl > 0, + tuple(0.1, 0.9, 0.0), + + -- We got a TLS inconsistent answer (ie. certificates are failing) and this + -- specific answer was never seen inside of the control. + -- This signifies that we are most likely dealing with a case of true blocking. + dns_tls_inconsistent > 0 AND dns_answer_matches_ctrl = 0, + tuple(0.9, 0.05, 0.05), + + -- We got a direct match for an answer in the control. This is also a strong signal + -- that we got something good. + dns_answer_matches_ctrl > 0, + tuple(0.0, 0.0, 0.9), + + -- The DNS answers contain a matching ASN comparing experiment and control. + -- Usually this is a sign that it's a valid answer, especially if we didn't trigger + -- the previous checks. + dns_answer_asn_matches_ctrl > 0, + tuple(0.2, 0.0, 0.8), + + -- DNS is failing, but it's also failing a lot in the control. There is likely some kind of issue + -- with the DNS configuration of the fqdn (eg. it doesn't exist and we are getting NXDOMAIN) + dns_failure IS NOT NULL AND ctrl_dns_success_rate <= 0.5, + tuple(0.1, 0.9, 0.0), + + -- DNS is failing, but it's suceeding inside our control. This is likely a case of true blocking. + dns_failure IS NOT NULL AND ctrl_dns_success_rate > 0.5, + tuple(0.9, 0.1, 0.0), + + dns_failure IS NOT NULL, + tuple(0.5, 0.5, 0), + + dns_failure IS NULL, + tuple(0.75, 0, 0.25), + + tuple(0.0, 0.0, 0.0) + + ) as dns_outcome, + + multiIf( + -- We are dealing with a row that doesn't have any TCP data associated to it, + -- most likely a HTTP(s) only observation row. + -- We set the mask to False so that this can be excluded from any aggregate + -- analysis. + tcp_success != 1 AND tcp_failure IS NULL, + tuple(0, 0, 0), + + -- We can connect, so there is nothing to see here. 
+ tcp_failure IS NULL AND tcp_success = 1, + tuple(0, 0, 1.0), + + -- We are seeing some failure, DNS was OK, yet the target address is IPv6 and we are seeing a lot of + -- failing IPv6 on the whole report_id set. This likely means that the probe has a broken IPv6 configuration. + -- We therefore set the mask to False so we exclude it from analysis. + tcp_failure IS NOT NULL AND ip_is_v6 = 1 AND tcp_ipv6_failure_rate > 0.5, + tuple(0, 0, 0), + + -- We got a failure, yet this particular address is mostly succeeding in control. + -- Let's mark it as blocked. + tcp_failure IS NOT NULL AND ctrl_tcp_success_rate > 0.5 AND ctrl_tcp_success_count > 0, + tuple(0.75, 0.25, 0), + + -- We didn't get a good DNS answer, so we can't do much to analyze this result set since we + -- can't trust what we saw in DNS, so we just return early and ignore this from the perspective of + -- a TCP analysis + dns_blocked > 0 AND dns_ok <= (dns_blocked + dns_down), + tuple(0, 0, 0), + + -- We got a failure, however control is also failing a lot. Let's mark it as down + tcp_failure IS NOT NULL AND ctrl_tcp_success_rate <= 0.5 AND ctrl_tcp_failing_count > 0, + tuple(0.25, 0.75, 0), + + tuple(0, 0, 0) + ) as tcp_outcome, + + multiIf( + -- # We are dealing with a row that doesn't have any TLS data associated to it, + -- # most likely a HTTP(s) only observation row. + -- # We set the mask to False so that this can be excluded from any aggregate + -- # analysis. + tls_is_certificate_valid IS NULL AND tls_failure IS NULL, + tuple(0, 0, 0), + + -- # We get a valid certificate, so there is nothing to see here. + tls_is_certificate_valid = 1, + tuple(0, 0, 1.0), + + -- # We got a failure, yet this particular address is mostly succeeding in control. + -- # Let's mark it as blocked. + tls_failure IS NOT NULL AND ctrl_tls_success_rate > 0.5 AND ctrl_tls_success_count > 0, + multiIf( + -- SSL related errors are more suspicious than others + startsWith(tls_failure, 'ssl_'), + tuple(0.9, 0.1, 0), + -- Connection reset carries more weight than timeouts and similar + tls_failure = 'connection_reset', + tuple(0.8, 0.2, 0), + tuple(0.7, 0.3, 0) + ), + + -- We didn't get a good DNS answer, so we can't do much to analyze this result set since we + -- can't trust what we saw in DNS, so we just return early and ignore this from the perspective of + -- a TCP analysis + dns_blocked > 0 AND dns_ok <= (dns_blocked + dns_down), + tuple(0, 0, 0), + + -- # The TCP analysis told us that this particular address is TCP blocked, + -- # therefore it's likely blocked via TCP and the TLS analysis should be + -- # thrown out. + tcp_blocked > 0 AND tcp_ok <= (tcp_blocked + tcp_down), + tuple(0, 0, 0), + + -- # We got a failure, however control is also failing a lot. Let's mark it as down. + tls_failure IS NOT NULL AND ctrl_tls_success_rate <= 0.5 AND ctrl_tls_failing_count > 0, + tuple(0.2, 0.8, 0), + + tuple(0, 0, 0) + ) as tls_outcome, + + ip, + ip_asn, + ip_is_bogon, + ip_is_v6, + + tcp_ipv6_failure_rate, + tcp_ipv4_failure_rate, + tcp_success, + tcp_failure, + tcp_t, + http_failure, + dns_outcome.1 as dns_blocked, + dns_outcome.2 as dns_down, + dns_outcome.3 as dns_ok, + + tcp_failure, + tcp_outcome.1 as tcp_blocked, + tcp_outcome.2 as tcp_down, + tcp_outcome.3 as tcp_ok, + + tls_failure, + tls_outcome.1 as tls_blocked, + tls_outcome.2 as tls_down, + tls_outcome.3 as tls_ok + + SELECT + -- We parse the domain from the input, like the current pipeline would. 
+ -- It's not possible to get it from the hostname column, because if the + -- measurement included a redirect chain, we might have tested domains different + -- than that inside of the input field. + domain(input) as domain, + input, test_name, + probe_asn, probe_as_org_name, probe_cc, + resolver_asn, resolver_as_cc, + network_type, + measurement_start_time, + measurement_uid, + ooni_run_link_id, + + anyHeavy(probe_analysis) as top_probe_analysis, + + anyHeavy(dns_failure) as top_dns_failure, + anyHeavy(tcp_failure) as top_tcp_failure, + anyHeavy(tls_failure) as top_tls_failure, + + max(dns_blocked) as dns_blocked_max, + max(dns_down) as dns_down_max, + max(dns_ok) as dns_ok_max, + -- IF( + -- dns_blocked_max > (dns_down_max + dns_ok_max), + -- concat('dns.', IF(top_dns_failure IS NOT NULL, top_dns_failure, 'none') + -- ), ''), + + max(tcp_blocked) as tcp_blocked_max, + max(tcp_down) as tcp_down_max, + max(tcp_ok) as tcp_ok_max, + + max(tls_blocked) as tls_blocked_max, + max(tls_down) as tls_down_max, + max(tls_ok) as tls_ok_max + + FROM ( + WITH + position(ip, '.') = 0 as ip_is_v6, + position(ip, '.') != 0 as ip_is_v4 + + SELECT + measurement_uid, + ooni_run_link_id, + report_id, + hostname, + input, + probe_asn, probe_as_org_name, probe_cc, resolver_asn, resolver_as_cc, network_type, + measurement_start_time, test_name, + toStartOfDay(measurement_start_time) as measurement_day, + + ip, + ip_asn, + ip_is_bogon, + ip_is_v6, + dns_failure, + dns_answer, + + -- We limit this to only the system resolver + -- TODO: in order to fully support web_connectivity 0.5 we should ideally + -- parse this as well. + groupArrayIf(dns_answer, dns_engine IN ('getaddrinfo', 'system')) over (partition by measurement_uid, hostname, ip_is_v6) as dns_answers, + groupArrayIf(ip_asn, dns_engine IN ('getaddrinfo', 'system')) over (partition by measurement_uid, hostname, ip_is_v6) as dns_answers_asns, + maxIf(ip_is_bogon, dns_engine IN ('getaddrinfo', 'system')) over (partition by measurement_uid, hostname, ip_is_v6) as dns_answers_contain_bogon, + + countIf(ip_asn IN %(cloud_provider_asns)s) over (partition by measurement_uid) as dns_answers_cloud, + + -- We use these to get an indication of whether IPv6 is entirely broken in + -- this probe. 
+ -- TODO: in the future we could use something other than report_id, but + -- closer to "run_id" to get all measurements from a particular probe at a + -- given time interval + countIf(ip_is_v6 AND tcp_failure IS NOT NULL) over (partition by report_id) as tcp_ipv6_failure_count, + countIf(ip_is_v6 AND tcp_success = 1) over (partition by report_id) as tcp_ipv6_success_count, + + countIf(ip_is_v4 AND tcp_success = 1) over (partition by report_id) as tcp_ipv4_success_count, + countIf(ip_is_v4 AND tcp_failure IS NOT NULL) over (partition by report_id) as tcp_ipv4_failure_count, + + tcp_ipv6_failure_count/(tcp_ipv6_success_count+tcp_ipv6_failure_count) as tcp_ipv6_failure_rate, + tcp_ipv4_failure_count/(tcp_ipv4_success_count+tcp_ipv4_failure_count) as tcp_ipv4_failure_rate, + + tcp_success, + tcp_failure, + tcp_t, + tls_is_certificate_valid, + tls_failure, + tls_handshake_time, + http_failure, + probe_analysis + + FROM + obs_web + WHERE + + {where_clause} + ) as experiment + + LEFT OUTER JOIN ( + SELECT + groupArray(expected_countries) as expected_countries, + pattern, + any(scope) as dns_blocking_scope + FROM fingerprints_dns + GROUP BY pattern + ) as fingerprints_dns + ON fingerprints_dns.pattern = experiment.dns_answer + + -- CTRL subquery + LEFT OUTER JOIN ( + SELECT + hostname, + measurement_day, + cloud_provider_ips_count, + not_cloud_provider_ips_count, + + arrayDistinct( + arrayConcat(mapKeys(ctrl_tls_success_ips), other_tls_consistent_ips) + ) as union_tls_consistent_ips, + + ctrl_dns_answers, + ctrl_dns_answers_asns, + ctrl_dns_failure_count, + ctrl_dns_success_count, + ctrl_tls_success_ips, + ctrl_tls_inconsistent_ips, + ctrl_tls_failing_ips, + + CAST( + arrayMap( + (ip) -> (ip, (ctrl_tls_success_ips[ip]/(ctrl_tls_success_ips[ip] + ctrl_tls_failing_ips[ip]))), + arrayConcat(mapKeys(ctrl_tls_success_ips), mapKeys(ctrl_tls_failing_ips)) + ), + 'Map(String, Float32)' + ) as ctrl_tls_success_rates, + + ctrl_tcp_success_ips, + ctrl_tcp_failing_ips, + CAST( + arrayMap( + (ip) -> (ip, (ctrl_tcp_success_ips[ip]/(ctrl_tcp_success_ips[ip] + ctrl_tcp_failing_ips[ip]))), + arrayConcat(mapKeys(ctrl_tcp_success_ips), mapKeys(ctrl_tcp_failing_ips)) + ), + 'Map(String, Float32)' + ) as ctrl_tcp_success_rates + + FROM + ( + WITH + CAST( + ([ip], [1]), + 'Map(String, UInt32)' + ) as ip_map, + + CAST( + ([IF(ip_asn IS NULL, 0, ip_asn)], [1]), + 'Map(UInt32, UInt32)' + ) as ip_asn_map + + SELECT + hostname, + toStartOfDay(measurement_start_time) as measurement_day, + + -- if the answer was inside of a cloud provider ASN + -- TODO: we aren't using it as part of the analysis. + length(groupUniqArrayIf(ip, ip_asn IN %(cloud_provider_asns)s)) as cloud_provider_ips_count, + length(groupUniqArrayIf(ip, ip_asn NOT IN %(cloud_provider_asns)s)) as not_cloud_provider_ips_count, + + -- list of DNS failures observed in the control for a given hostname + countIf(dns_failure IS NOT NULL) as ctrl_dns_failure_count, + countIf(dns_success = 1) as ctrl_dns_success_count, + + sumMapIf(ip_asn_map, dns_success = 1 AND ip_asn != 0) as ctrl_dns_answers_asns, + sumMapIf(ip_map, dns_success = 1) as ctrl_dns_answers, + + -- list of IPs that are TLS consistent for a given hostname (i.e. a TLS handshake succeeds) + --groupUniqArrayIf(ip, tls_success = 1) as ctrl_tls_success_ips, + sumMapIf(ip_map, tls_success = 1) as ctrl_tls_success_ips, + + -- list of IPs that are TLS inconsistent for a given hostname + -- (i.e. 
a TLS handshake fails with a certificate error) + --groupUniqArrayIf(ip, + sumMapIf(ip_map, + tls_success = 0 + AND tls_failure LIKE 'ssl_%%' + ) as ctrl_tls_inconsistent_ips, + + -- list of IPs that are TLS failing + --groupUniqArrayIf(ip, + sumMapIf(ip_map, + tls_success = 0 AND tls_failure IS NOT NULL + ) as ctrl_tls_failing_ips, + + -- list of IPs that are successful via TCP + --groupUniqArrayIf(ip, tcp_success = 1) as ctrl_tcp_success_ips, + sumMapIf( + ip_map, + tcp_success = 1 + ) as ctrl_tcp_success_ips, + + -- list of IPs that are failing + --groupUniqArrayIf(ip, tcp_success = 0) as ctrl_tcp_failing_ips + sumMapIf(ip_map, tcp_success = 0) as ctrl_tcp_failing_ips + + FROM + obs_web_ctrl + WHERE measurement_start_time > %(start_time)s + AND measurement_start_time <= %(end_time)s + GROUP BY hostname, measurement_day + ) AS ctrl + + LEFT OUTER JOIN + ( + SELECT + hostname, + toStartOfDay(measurement_start_time) as measurement_day, + groupArrayIf(ip, tls_is_certificate_valid = 1) as other_tls_consistent_ips + + FROM + obs_web + WHERE measurement_start_time > %(start_time)s + AND measurement_start_time <= %(end_time)s + GROUP BY hostname, measurement_day + ) as other + ON ctrl.hostname = other.hostname AND ctrl.measurement_day = other.measurement_day + ) as full_ctrl + ON full_ctrl.hostname = experiment.hostname AND full_ctrl.measurement_day = experiment.measurement_day + GROUP BY domain, + input, + probe_asn, probe_as_org_name, probe_cc, + resolver_asn, resolver_as_cc, + network_type, test_name, + measurement_start_time, + measurement_uid, + ooni_run_link_id """ - # I'm amazed python doesn't have this in the standard library - # and urlparse is incredibly inconsistent with it's handling of IPv6 - # addresses. - ipaddr = ipaddress.ip_address(ip) - addr = ip - if isinstance(ipaddr, ipaddress.IPv6Address): - addr = "[" + ip + "]" - - if port: - addr += f":{port}" - return addr - - -@dataclass -class TCPAnalysis: - address: str - success: bool - failure: Optional[str] - - ground_truth_failure_count: Optional[int] - ground_truth_failure_asn_cc_count: Optional[int] - ground_truth_ok_count: Optional[int] - ground_truth_ok_asn_cc_count: Optional[int] - - ground_truth_trusted_failure_count: Optional[int] - ground_truth_trusted_ok_count: Optional[int] - - -def make_tcp_analysis( - web_o: WebObservation, web_ground_truths: List[WebGroundTruth] -) -> TCPAnalysis: - assert web_o.ip is not None and web_o.port is not None - - blocking_subject = encode_address(web_o.ip, web_o.port) - - # It's working, wothing to see here, go on with your life - if web_o.tcp_success: - return TCPAnalysis( - address=blocking_subject, - success=True, - failure=None, - ground_truth_failure_asn_cc_count=None, - ground_truth_failure_count=None, - ground_truth_ok_asn_cc_count=None, - ground_truth_ok_count=None, - ground_truth_trusted_failure_count=None, - ground_truth_trusted_ok_count=None, - ) - - assert ( - web_o.tcp_failure is not None - ), "inconsistency between tcp_success and tcp_failure" - - ground_truths = filter( - lambda gt: gt.ip == web_o.ip and gt.port == web_o.port, web_ground_truths + return SQL, q_params + + +def get_analysis_web_fuzzy_logic( + db: ClickhouseConnection, + start_time: datetime, + end_time: datetime, + probe_cc: List[str], + # We are only doing web_connectivity for the moment + test_name: List[str] = ["web_connectivity"], + measurement_uid: Optional[str] = None, +): + SQL, q_params = format_query_analysis_web_fuzzy_logic( + start_time=start_time, + end_time=end_time, + probe_cc=probe_cc, + 
test_name=test_name, + measurement_uid=measurement_uid, ) - unreachable_cc_asn = set() - reachable_cc_asn = set() - - tcp_ground_truth_failure_asn_cc_count = 0 - tcp_ground_truth_failure_count = 0 - tcp_ground_truth_ok_asn_cc_count = 0 - tcp_ground_truth_ok_count = 0 - tcp_ground_truth_trusted_failure_count = 0 - tcp_ground_truth_trusted_ok_count = 0 - - for gt in ground_truths: - if gt.tcp_success is None: - continue - # We don't check for strict == True, since depending on the DB engine - # True could also be represented as 1 - if gt.tcp_success: - if gt.is_trusted_vp: - tcp_ground_truth_trusted_ok_count += gt.count - else: - tcp_ground_truth_ok_count += 1 - reachable_cc_asn.add((gt.vp_cc, gt.vp_asn)) - else: - if gt.is_trusted_vp: - tcp_ground_truth_trusted_failure_count += gt.count - else: - tcp_ground_truth_failure_count += 1 - unreachable_cc_asn.add((gt.vp_cc, gt.vp_asn)) - - tcp_ground_truth_failure_asn_cc_count = len(unreachable_cc_asn) - tcp_ground_truth_ok_asn_cc_count = len(reachable_cc_asn) - - return TCPAnalysis( - address=blocking_subject, - success=False, - failure=web_o.tcp_failure, - ground_truth_failure_asn_cc_count=tcp_ground_truth_failure_asn_cc_count, - ground_truth_failure_count=tcp_ground_truth_failure_count, - ground_truth_ok_asn_cc_count=tcp_ground_truth_ok_asn_cc_count, - ground_truth_ok_count=tcp_ground_truth_ok_count, - ground_truth_trusted_failure_count=tcp_ground_truth_trusted_failure_count, - ground_truth_trusted_ok_count=tcp_ground_truth_trusted_ok_count, + res = db.execute_iter(SQL, params=q_params, with_column_types=True) + column_names = list(map(lambda x: x[0], next(res))) + for row in res: + row = dict(zip(column_names, row)) + yield row + + +def write_analysis_web_fuzzy_logic( + db: ClickhouseConnection, + start_time: datetime, + end_time: datetime, + probe_cc: List[str], + # We are only doing web_connectivity for the moment + test_name: List[str] = ["web_connectivity"], + measurement_uid: Optional[str] = None, +): + SQL, q_params = format_query_analysis_web_fuzzy_logic( + start_time=start_time, + end_time=end_time, + probe_cc=probe_cc, + test_name=test_name, + measurement_uid=measurement_uid, ) - - -@dataclass -class DNSGroundTruth: - nxdomain_count: int - nxdomain_cc_asn: set - failure_cc_asn: set - failure_count: int - ok_cc_asn: set - ok_count: int - other_ips: Dict[str, set] - other_asns: Dict[str, set] - trusted_answers: Dict - - ok_cc_asn_count: int - failure_cc_asn_count: int - nxdomain_cc_asn_count: int - - -def make_dns_ground_truth(ground_truths: Iterable[WebGroundTruth]): - """ - Here we count how many vantage vantage points, as in distinct probe_cc, - probe_asn pairs, presented the various types of results. 
- """ - nxdomain_cc_asn = set() - failure_cc_asn = set() - ok_cc_asn = set() - other_ips = defaultdict(set) - other_asns = defaultdict(set) - trusted_answers = {} - ok_count = 0 - failure_count = 0 - nxdomain_count = 0 - for gt in ground_truths: - if gt.dns_success is None and gt.dns_failure is None: - continue - - if gt.dns_failure == "dns_nxdomain_error": - nxdomain_count += gt.count - nxdomain_cc_asn.add((gt.vp_cc, gt.vp_asn)) - continue - - if gt.dns_failure is not None: - failure_count += gt.count - failure_cc_asn.add((gt.vp_cc, gt.vp_asn)) - continue - - ok_count += gt.count - ok_cc_asn.add((gt.vp_cc, gt.vp_asn)) - other_ips[gt.ip].add((gt.vp_cc, gt.vp_asn)) - assert gt.ip, "did not find IP in ground truth" - other_asns[gt.ip_asn].add((gt.vp_cc, gt.vp_asn)) - if gt.tls_is_certificate_valid == True or gt.is_trusted_vp == True: - trusted_answers[gt.ip] = gt - - return DNSGroundTruth( - failure_count=failure_count, - ok_count=ok_count, - nxdomain_count=nxdomain_count, - nxdomain_cc_asn=nxdomain_cc_asn, - failure_cc_asn=failure_cc_asn, - ok_cc_asn=ok_cc_asn, - other_asns=other_asns, - other_ips=other_ips, - trusted_answers=trusted_answers, - ok_cc_asn_count=len(ok_cc_asn), - failure_cc_asn_count=len(failure_cc_asn), - nxdomain_cc_asn_count=len(nxdomain_cc_asn), + INSERT_SQL = f""" + INSERT INTO analysis_web_measurement + SELECT * FROM ( + {SQL} ) - - -def dns_observations_by_resolver( - dns_observations: List[WebObservation], -) -> Dict[str, List[WebObservation]]: - by_resolver = defaultdict(list) - for dns_o in dns_observations: - dns_engine = dns_o.dns_engine or "system" - key = f"{dns_engine}-{dns_o.dns_engine_resolver_address}" - by_resolver[key].append(dns_o) - return by_resolver - - -@dataclass -class DNSConsistencyResults: - answers: List[str] - success: bool = False - failure: Optional[str] = None - answer_count: int = 0 - - is_answer_tls_consistent: bool = False - is_answer_tls_inconsistent: bool = False - is_answer_ip_in_trusted_answers: bool = False - is_answer_asn_in_trusted_answers: bool = False - is_answer_asorg_in_trusted_answers: bool = False - is_answer_cloud_provider: bool = False - is_answer_probe_asn_match: bool = False - is_answer_probe_cc_match: bool = False - is_answer_bogon: bool = False - - answer_fp_name: str = "" - answer_fp_scope: str = "" - is_answer_fp_match: bool = False - is_answer_fp_country_consistent: bool = False - is_answer_fp_false_positive: bool = False - - is_resolver_probe_asn_match: bool = False - is_resolver_probe_cc_match: bool = False - - answer_ip_ground_truth_asn_count: int = 0 - answer_asn_ground_truth_asn_count: int = 0 - - -def check_dns_consistency( - dns_observations: List[WebObservation], - dns_ground_truth: DNSGroundTruth, - fingerprintdb: FingerprintDB, -) -> DNSConsistencyResults: """ - Do a web_connectivity style DNS consistency check. - - If we are in this case, it means we weren't able to determine the - consistency of the DNS query using TLS. This is the case either - because the tested site is not in HTTPS and therefore we didn't - generate a TLS measurement for it or because the target IP isn't - listening on HTTPS (which is quite fishy). - In either case we should flag these with being somewhat likely to be - blocked. 
- """ - consistency_results = DNSConsistencyResults(answers=[]) - - ground_truth_asns = set() - ground_truth_ips = set() - ground_truth_as_org_names = set() - for gt in dns_ground_truth.trusted_answers.values(): - assert gt.ip, f"did not find IP in ground truth {gt.ip}" - ground_truth_ips.add(gt.ip) - ground_truth_asns.add(gt.ip_asn) - ground_truth_as_org_names.add(gt.ip_as_org_name.lower()) - - for web_o in dns_observations: - if web_o.dns_failure == None and web_o.dns_answer: - consistency_results.success = True - consistency_results.answers.append(web_o.dns_answer) - consistency_results.answer_count += 1 - else: - consistency_results.failure = web_o.dns_failure - - fp = fingerprintdb.match_dns(web_o.dns_answer) - - if fp: - consistency_results.is_answer_fp_match = True - if ( - fp.expected_countries - and web_o.probe_meta.probe_cc in fp.expected_countries - ): - consistency_results.is_answer_fp_country_consistent = True - if fp.scope == "fp": - consistency_results.is_answer_fp_false_positive = True - # XXX in the event of multiple matches, we are overriding it with - # the last value. It's probably OK for now. - consistency_results.answer_fp_name = fp.name - consistency_results.answer_fp_scope = fp.scope or "" - - if not web_o.dns_engine or web_o.dns_engine in SYSTEM_RESOLVERS: - # TODO: do the same thing for the non-system resolver - if web_o.probe_meta.resolver_asn == web_o.probe_meta.probe_asn: - consistency_results.is_resolver_probe_asn_match = True - if web_o.probe_meta.resolver_cc == web_o.probe_meta.probe_cc: - consistency_results.is_resolver_probe_cc_match = True - - if web_o.tls_is_certificate_valid == True: - consistency_results.is_answer_tls_consistent = True - - if web_o.tls_is_certificate_valid == False: - consistency_results.is_answer_tls_inconsistent = True - - if web_o.ip_is_bogon: - consistency_results.is_answer_bogon = True - - if web_o.dns_answer_asn in ground_truth_asns: - consistency_results.is_answer_asn_in_trusted_answers = True - - if web_o.dns_answer in ground_truth_ips: - consistency_results.is_answer_ip_in_trusted_answers = True - - if ( - web_o.dns_answer_as_org_name - and web_o.dns_answer_as_org_name.lower() in ground_truth_as_org_names - ): - consistency_results.is_answer_asorg_in_trusted_answers = True - - if web_o.dns_answer in dns_ground_truth.other_ips: - consistency_results.answer_ip_ground_truth_asn_count += len( - dns_ground_truth.other_ips[web_o.dns_answer] - ) - - if web_o.dns_answer in dns_ground_truth.other_asns: - consistency_results.answer_asn_ground_truth_asn_count += len( - dns_ground_truth.other_asns[web_o.dns_answer] - ) - - if is_cloud_provider(asn=web_o.ip_asn, as_org_name=web_o.ip_as_org_name): - consistency_results.is_answer_cloud_provider = True - - if web_o.dns_answer_asn == web_o.probe_meta.probe_asn: - consistency_results.is_answer_probe_asn_match = True - elif web_o.ip_as_cc == web_o.probe_meta.probe_cc: - consistency_results.is_answer_probe_cc_match = True - - return consistency_results - - -@dataclass -class DNSAnalysis: - ground_truth: DNSGroundTruth - - consistency_system: DNSConsistencyResults - consistency_other: Optional[DNSConsistencyResults] - - -def make_dns_analysis( - hostname: str, - dns_observations: List[WebObservation], - web_ground_truths: List[WebGroundTruth], - fingerprintdb: FingerprintDB, -) -> DNSAnalysis: - dns_ground_truth = make_dns_ground_truth( - ground_truths=filter( - lambda gt: gt.hostname == hostname, - web_ground_truths, - ) - ) - dns_consistency_system = None - dns_consistency_other = None - - for 
resolver_str, dns_observations in dns_observations_by_resolver( - dns_observations - ).items(): - if any([resolver_str.startswith(s) for s in SYSTEM_RESOLVERS]): - dns_consistency_system = check_dns_consistency( - fingerprintdb=fingerprintdb, - dns_observations=dns_observations, - dns_ground_truth=dns_ground_truth, - ) - elif resolver_str.startswith("udp"): - if dns_consistency_other is not None: - log.warn( - f"more than one alternative resolver in query list. overriding. " - f"msmt_uid={dns_observations[0].measurement_meta.measurement_uid}" - ) - dns_consistency_other = check_dns_consistency( - fingerprintdb=fingerprintdb, - dns_observations=dns_observations, - dns_ground_truth=dns_ground_truth, - ) - elif resolver_str.startswith("doh"): - # TODO: currently we ignore doh answers. Maybe we can do something with them in the future. - pass - - assert dns_consistency_system is not None, "could not find system DNS resolution" - - return DNSAnalysis( - ground_truth=dns_ground_truth, - consistency_system=dns_consistency_system, - consistency_other=dns_consistency_other, - ) - - -@dataclass -class TLSAnalysis: - success: bool - failure: Optional[str] - is_tls_certificate_valid: bool - is_tls_certificate_invalid: bool - - handshake_read_count: Optional[int] - handshake_write_count: Optional[int] - handshake_read_bytes: Optional[float] - handshake_write_bytes: Optional[float] - handshake_time: Optional[float] - - ground_truth_failure_count: int = 0 - ground_truth_failure_asn_cc_count: int = 0 - ground_truth_ok_count: int = 0 - ground_truth_ok_asn_cc_count: int = 0 - - ground_truth_trusted_failure_count: int = 0 - ground_truth_trusted_ok_count: int = 0 - - -def make_tls_analysis( - web_o: WebObservation, web_ground_truths: List[WebGroundTruth] -) -> TLSAnalysis: - tls_analysis = TLSAnalysis( - success=web_o.tls_is_certificate_valid == True, - failure=web_o.tls_failure, - is_tls_certificate_valid=web_o.tls_is_certificate_valid == True, - is_tls_certificate_invalid=web_o.tls_is_certificate_valid == False, - handshake_read_count=web_o.tls_handshake_read_count, - handshake_write_count=web_o.tls_handshake_write_count, - handshake_read_bytes=web_o.tls_handshake_read_bytes, - handshake_write_bytes=web_o.tls_handshake_write_bytes, - handshake_time=web_o.tls_handshake_time, - ) - ground_truths = filter( - lambda gt: gt.ip == web_o.ip and gt.port == web_o.port, web_ground_truths - ) - failure_cc_asn = set() - ok_cc_asn = set() - for gt in ground_truths: - # We don't check for strict == True, since depending on the DB engine - # True could also be represented as 1 - if gt.tls_success is None: - continue - - if gt.tls_success: - if gt.is_trusted_vp: - tls_analysis.ground_truth_trusted_ok_count += gt.count - else: - tls_analysis.ground_truth_ok_count += gt.count - ok_cc_asn.add((gt.vp_cc, gt.vp_asn)) - else: - if gt.is_trusted_vp: - tls_analysis.ground_truth_trusted_failure_count += gt.count - else: - tls_analysis.ground_truth_failure_count += gt.count - failure_cc_asn.add((gt.vp_cc, gt.vp_asn, gt.count)) - - tls_analysis.ground_truth_ok_asn_cc_count = len(ok_cc_asn) - tls_analysis.ground_truth_failure_asn_cc_count = len(failure_cc_asn) - - return tls_analysis - - -@dataclass -class HTTPAnalysis: - success: bool - failure: Optional[str] - is_http_request_encrypted: bool - - response_body_proportion: Optional[float] = None - response_body_length: Optional[int] = None - response_status_code: Optional[int] = None - - ground_truth_failure_count: int = 0 - ground_truth_failure_asn_cc_count: int = 0 - 
ground_truth_ok_count: int = 0 - ground_truth_ok_asn_cc_count: int = 0 - - ground_truth_trusted_ok_count: int = 0 - ground_truth_trusted_failure_count: int = 0 - ground_truth_body_length: int = 0 - - fp_name: str = "" - fp_scope: str = "" - is_http_fp_match: bool = False - is_http_fp_country_consistent: bool = False - is_http_fp_false_positive: bool = False - - -def make_http_analysis( - web_o: WebObservation, - web_ground_truths: List[WebGroundTruth], - body_db: BodyDB, - fingerprintdb: FingerprintDB, -) -> HTTPAnalysis: - assert web_o.http_request_url - - http_analysis = HTTPAnalysis( - success=web_o.http_failure == None, - failure=web_o.http_failure, - is_http_request_encrypted=web_o.http_request_url.startswith("https://"), - ) - - ground_truths = filter( - lambda gt: gt.http_request_url == web_o.http_request_url, web_ground_truths - ) - failure_cc_asn = set() - ok_cc_asn = set() - response_body_len_count = defaultdict(int) - for gt in ground_truths: - # We don't check for strict == True, since depending on the DB engine - # True could also be represented as 1 - if gt.http_success is None: - continue - - # TODO: figure out why some are negative - if gt.http_response_body_length and gt.http_response_body_length > 0: - response_body_len_count[gt.http_response_body_length] += gt.count - - if gt.http_success: - if gt.is_trusted_vp: - http_analysis.ground_truth_trusted_ok_count += gt.count - else: - http_analysis.ground_truth_ok_count += gt.count - ok_cc_asn.add((gt.vp_cc, gt.vp_asn)) - else: - if gt.is_trusted_vp: - http_analysis.ground_truth_trusted_failure_count += gt.count - else: - http_analysis.ground_truth_failure_count += gt.count - failure_cc_asn.add((gt.vp_cc, gt.vp_asn, gt.count)) - - response_body_length = 0 - if len(response_body_len_count) > 0: - response_body_length = max(response_body_len_count.items(), key=lambda x: x[1])[ - 0 - ] - http_analysis.ground_truth_body_length = response_body_length - - # Untrusted Vantage Points (i.e. not control measurements) only count - # once per probe_cc, probe_asn pair to avoid spammy probes poisoning our - # data - http_analysis.ground_truth_failure_asn_cc_count += len(failure_cc_asn) - http_analysis.ground_truth_ok_asn_cc_count += len(ok_cc_asn) - - # TODO: do we care to do something about empty bodies? 
- # They are commonly a source of blockpages - if web_o.http_response_body_sha1: - matched_fp = body_db.lookup(web_o.http_response_body_sha1) - if len(matched_fp) > 0: - for fp_name in matched_fp: - fp = fingerprintdb.get_fp(fp_name) - if fp.scope: - http_analysis.is_http_fp_match = True - if fp.scope == "fp": - http_analysis.is_http_fp_false_positive = True - if ( - fp.expected_countries - and web_o.probe_meta.probe_cc in fp.expected_countries - ): - http_analysis.is_http_fp_country_consistent = True - if fp.name: - http_analysis.fp_name = fp.name - http_analysis.fp_scope = fp.scope - - if web_o.http_response_body_length: - http_analysis.response_body_length = web_o.http_response_body_length - http_analysis.response_body_proportion = ( - web_o.http_response_body_length + 1.0 - ) / (response_body_length + 1.0) - - http_analysis.response_status_code = web_o.http_response_status_code - return http_analysis - - -def make_web_analysis( - web_observations: List[WebObservation], - web_ground_truths: List[WebGroundTruth], - body_db: BodyDB, - fingerprintdb: FingerprintDB, -) -> Generator[WebAnalysis, None, None]: - domain_name = web_observations[0].hostname or "" - - dns_observations_by_hostname = defaultdict(list) - dns_analysis_by_hostname = {} - other_observations = [] - for web_o in web_observations: - if web_o.dns_query_type: - # assert web_o.hostname is not None, web_o - # TODO(arturo): this is a workaround for: https://github.com/ooni/probe/issues/2628 - if web_o.hostname is None: - log.error( - f"missing hostname for DNS query {web_o}. Skipping DNS observation." - ) - continue - dns_observations_by_hostname[web_o.hostname].append(web_o) - else: - other_observations.append(web_o) - - for hostname, dns_observations in dns_observations_by_hostname.items(): - dns_analysis = make_dns_analysis( - hostname=hostname, - dns_observations=dns_observations, - web_ground_truths=web_ground_truths, - fingerprintdb=fingerprintdb, - ) - dns_analysis_by_hostname[hostname] = dns_analysis - - for idx, web_o in enumerate(web_observations): - target_detail = web_o.http_request_url or domain_name - if web_o.ip: - try: - ipaddr = ipaddress.ip_address(web_o.ip) - # TODO(arturo): for the moment we just ignore all IPv6 results, because they are too noisy - if isinstance(ipaddr, ipaddress.IPv6Address): - continue - address = encode_address(web_o.ip, web_o.port) - target_detail = f"{address} {target_detail}" - except: - log.error(f"Invalid IP in {web_o.ip}") - - dns_analysis = dns_analysis_by_hostname.get(web_o.hostname, None) - - tcp_analysis = None - tls_analysis = None - http_analysis = None - if web_o.tcp_success is not None: - tcp_analysis = make_tcp_analysis( - web_o=web_o, web_ground_truths=web_ground_truths - ) - if web_o.tls_failure or web_o.tls_cipher_suite is not None: - tls_analysis = make_tls_analysis( - web_o=web_o, web_ground_truths=web_ground_truths - ) - - if web_o.http_request_url: - http_analysis = make_http_analysis( - web_o=web_o, - web_ground_truths=web_ground_truths, - body_db=body_db, - fingerprintdb=fingerprintdb, - ) - - created_at = datetime.now(timezone.utc).replace(tzinfo=None) - website_analysis = WebAnalysis( - measurement_meta=web_o.measurement_meta, - probe_meta=web_o.probe_meta, - observation_id=f"{web_o.measurement_meta.measurement_uid}_{web_o.observation_idx}", - created_at=created_at, - analysis_id=f"{web_o.measurement_meta.measurement_uid}_{idx}", - target_domain_name=domain_name, - target_detail=target_detail, - ) - - if dns_analysis: - - 
website_analysis.dns_consistency_system_answers = ( - dns_analysis.consistency_system.answers - ) - website_analysis.dns_consistency_system_success = ( - dns_analysis.consistency_system.success - ) - website_analysis.dns_consistency_system_failure = ( - dns_analysis.consistency_system.failure - ) - website_analysis.dns_consistency_system_answer_count = ( - dns_analysis.consistency_system.answer_count - ) - website_analysis.dns_consistency_system_is_answer_tls_consistent = ( - dns_analysis.consistency_system.is_answer_tls_consistent - ) - website_analysis.dns_consistency_system_is_answer_tls_inconsistent = ( - dns_analysis.consistency_system.is_answer_tls_inconsistent - ) - website_analysis.dns_consistency_system_is_answer_ip_in_trusted_answers = ( - dns_analysis.consistency_system.is_answer_ip_in_trusted_answers - ) - website_analysis.dns_consistency_system_is_answer_asn_in_trusted_answers = ( - dns_analysis.consistency_system.is_answer_asn_in_trusted_answers - ) - website_analysis.dns_consistency_system_is_answer_asorg_in_trusted_answers = ( - dns_analysis.consistency_system.is_answer_asorg_in_trusted_answers - ) - website_analysis.dns_consistency_system_is_answer_cloud_provider = ( - dns_analysis.consistency_system.is_answer_cloud_provider - ) - website_analysis.dns_consistency_system_is_answer_probe_asn_match = ( - dns_analysis.consistency_system.is_answer_probe_asn_match - ) - website_analysis.dns_consistency_system_is_answer_probe_cc_match = ( - dns_analysis.consistency_system.is_answer_probe_cc_match - ) - website_analysis.dns_consistency_system_is_answer_bogon = ( - dns_analysis.consistency_system.is_answer_bogon - ) - website_analysis.dns_consistency_system_answer_fp_name = ( - dns_analysis.consistency_system.answer_fp_name - ) - website_analysis.dns_consistency_system_answer_fp_scope = ( - dns_analysis.consistency_system.answer_fp_scope - ) - website_analysis.dns_consistency_system_is_answer_fp_match = ( - dns_analysis.consistency_system.is_answer_fp_match - ) - website_analysis.dns_consistency_system_is_answer_fp_country_consistent = ( - dns_analysis.consistency_system.is_answer_fp_country_consistent - ) - website_analysis.dns_consistency_system_is_answer_fp_false_positive = ( - dns_analysis.consistency_system.is_answer_fp_false_positive - ) - website_analysis.dns_consistency_system_is_resolver_probe_asn_match = ( - dns_analysis.consistency_system.is_resolver_probe_asn_match - ) - website_analysis.dns_consistency_system_is_resolver_probe_cc_match = ( - dns_analysis.consistency_system.is_resolver_probe_cc_match - ) - website_analysis.dns_consistency_system_answer_ip_ground_truth_asn_count = ( - dns_analysis.consistency_system.answer_ip_ground_truth_asn_count - ) - website_analysis.dns_consistency_system_answer_asn_ground_truth_asn_count = ( - dns_analysis.consistency_system.answer_asn_ground_truth_asn_count - ) - - website_analysis.dns_ground_truth_failure_count = ( - dns_analysis.ground_truth.failure_count - ) - website_analysis.dns_ground_truth_ok_count = ( - dns_analysis.ground_truth.ok_count - ) - website_analysis.dns_ground_truth_nxdomain_count = ( - dns_analysis.ground_truth.nxdomain_count - ) - website_analysis.dns_ground_truth_ok_cc_asn_count = ( - dns_analysis.ground_truth.ok_cc_asn_count - ) - website_analysis.dns_ground_truth_failure_cc_asn_count = ( - dns_analysis.ground_truth.failure_cc_asn_count - ) - website_analysis.dns_ground_truth_nxdomain_cc_asn_count = ( - dns_analysis.ground_truth.nxdomain_cc_asn_count - ) - - """ - 
website_analysis.dns_ground_truth_nxdomain_cc_asn = ( - dns_analysis.ground_truth.nxdomain_cc_asn - ) - website_analysis.dns_ground_truth_failure_cc_asn = ( - dns_analysis.ground_truth.failure_cc_asn - ) - website_analysis.dns_ground_truth_ok_cc_asn = ( - dns_analysis.ground_truth.ok_cc_asn - ) - website_analysis.dns_ground_truth_other_ips = ( - dns_analysis.ground_truth.other_ips - ) - website_analysis.dns_ground_truth_other_asns = ( - dns_analysis.ground_truth.other_asns - ) - website_analysis.dns_ground_truth_trusted_answers = ( - dns_analysis.ground_truth.trusted_answers - ) - """ - - if dns_analysis and dns_analysis.consistency_other: - website_analysis.dns_consistency_other_answers = ( - dns_analysis.consistency_other.answers - ) - website_analysis.dns_consistency_other_success = ( - dns_analysis.consistency_other.success - ) - website_analysis.dns_consistency_other_failure = ( - dns_analysis.consistency_other.failure - ) - website_analysis.dns_consistency_other_answer_count = ( - dns_analysis.consistency_other.answer_count - ) - website_analysis.dns_consistency_other_is_answer_tls_consistent = ( - dns_analysis.consistency_other.is_answer_tls_consistent - ) - website_analysis.dns_consistency_other_is_answer_tls_inconsistent = ( - dns_analysis.consistency_other.is_answer_tls_inconsistent - ) - website_analysis.dns_consistency_other_is_answer_ip_in_trusted_answers = ( - dns_analysis.consistency_other.is_answer_ip_in_trusted_answers - ) - website_analysis.dns_consistency_other_is_answer_asn_in_trusted_answers = ( - dns_analysis.consistency_other.is_answer_asn_in_trusted_answers - ) - website_analysis.dns_consistency_other_is_answer_asorg_in_trusted_answers = ( - dns_analysis.consistency_other.is_answer_asorg_in_trusted_answers - ) - website_analysis.dns_consistency_other_is_answer_cloud_provider = ( - dns_analysis.consistency_other.is_answer_cloud_provider - ) - website_analysis.dns_consistency_other_is_answer_probe_asn_match = ( - dns_analysis.consistency_other.is_answer_probe_asn_match - ) - website_analysis.dns_consistency_other_is_answer_probe_cc_match = ( - dns_analysis.consistency_other.is_answer_probe_cc_match - ) - website_analysis.dns_consistency_other_is_answer_bogon = ( - dns_analysis.consistency_other.is_answer_bogon - ) - website_analysis.dns_consistency_other_answer_fp_name = ( - dns_analysis.consistency_other.answer_fp_name - ) - website_analysis.dns_consistency_other_answer_fp_scope = ( - dns_analysis.consistency_other.answer_fp_scope - ) - website_analysis.dns_consistency_other_is_answer_fp_match = ( - dns_analysis.consistency_other.is_answer_fp_match - ) - website_analysis.dns_consistency_other_is_answer_fp_country_consistent = ( - dns_analysis.consistency_other.is_answer_fp_country_consistent - ) - website_analysis.dns_consistency_other_is_answer_fp_false_positive = ( - dns_analysis.consistency_other.is_answer_fp_false_positive - ) - website_analysis.dns_consistency_other_is_resolver_probe_asn_match = ( - dns_analysis.consistency_other.is_resolver_probe_asn_match - ) - website_analysis.dns_consistency_other_is_resolver_probe_cc_match = ( - dns_analysis.consistency_other.is_resolver_probe_cc_match - ) - website_analysis.dns_consistency_other_answer_ip_ground_truth_asn_count = ( - dns_analysis.consistency_other.answer_ip_ground_truth_asn_count - ) - website_analysis.dns_consistency_other_answer_asn_ground_truth_asn_count = ( - dns_analysis.consistency_other.answer_asn_ground_truth_asn_count - ) - if tls_analysis: - website_analysis.tls_success = tls_analysis.success - 
website_analysis.tls_failure = tls_analysis.failure - website_analysis.tls_is_tls_certificate_valid = ( - tls_analysis.is_tls_certificate_valid - ) - website_analysis.tls_is_tls_certificate_invalid = ( - tls_analysis.is_tls_certificate_invalid - ) - website_analysis.tls_handshake_read_count = ( - tls_analysis.handshake_read_count - ) - website_analysis.tls_handshake_write_count = ( - tls_analysis.handshake_write_count - ) - website_analysis.tls_handshake_read_bytes = ( - tls_analysis.handshake_read_bytes - ) - website_analysis.tls_handshake_write_bytes = ( - tls_analysis.handshake_write_bytes - ) - website_analysis.tls_handshake_time = tls_analysis.handshake_time - website_analysis.tls_ground_truth_failure_count = ( - tls_analysis.ground_truth_failure_count - ) - website_analysis.tls_ground_truth_failure_asn_cc_count = ( - tls_analysis.ground_truth_failure_asn_cc_count - ) - website_analysis.tls_ground_truth_ok_count = ( - tls_analysis.ground_truth_ok_count - ) - website_analysis.tls_ground_truth_ok_asn_cc_count = ( - tls_analysis.ground_truth_ok_asn_cc_count - ) - website_analysis.tls_ground_truth_trusted_failure_count = ( - tls_analysis.ground_truth_trusted_failure_count - ) - website_analysis.tls_ground_truth_trusted_ok_count = ( - tls_analysis.ground_truth_trusted_ok_count - ) - if tcp_analysis: - website_analysis.tcp_address = tcp_analysis.address - website_analysis.tcp_success = tcp_analysis.success - website_analysis.tcp_failure = tcp_analysis.failure - website_analysis.tcp_ground_truth_failure_count = ( - tcp_analysis.ground_truth_failure_count - ) - website_analysis.tcp_ground_truth_failure_asn_cc_count = ( - tcp_analysis.ground_truth_failure_asn_cc_count - ) - website_analysis.tcp_ground_truth_ok_count = ( - tcp_analysis.ground_truth_ok_count - ) - website_analysis.tcp_ground_truth_ok_asn_cc_count = ( - tcp_analysis.ground_truth_ok_asn_cc_count - ) - website_analysis.tcp_ground_truth_trusted_failure_count = ( - tcp_analysis.ground_truth_trusted_failure_count - ) - website_analysis.tcp_ground_truth_trusted_ok_count = ( - tcp_analysis.ground_truth_trusted_ok_count - ) - if http_analysis: - website_analysis.http_success = http_analysis.success - website_analysis.http_failure = http_analysis.failure - website_analysis.http_is_http_request_encrypted = ( - http_analysis.is_http_request_encrypted - ) - website_analysis.http_response_body_length = ( - http_analysis.response_body_length - ) - website_analysis.http_response_body_proportion = ( - http_analysis.response_body_proportion - ) - website_analysis.http_response_status_code = ( - http_analysis.response_status_code - ) - website_analysis.http_ground_truth_failure_count = ( - http_analysis.ground_truth_failure_count - ) - website_analysis.http_ground_truth_failure_asn_cc_count = ( - http_analysis.ground_truth_failure_asn_cc_count - ) - website_analysis.http_ground_truth_ok_count = ( - http_analysis.ground_truth_ok_count - ) - website_analysis.http_ground_truth_ok_asn_cc_count = ( - http_analysis.ground_truth_ok_asn_cc_count - ) - website_analysis.http_ground_truth_trusted_ok_count = ( - http_analysis.ground_truth_trusted_ok_count - ) - website_analysis.http_ground_truth_trusted_failure_count = ( - http_analysis.ground_truth_trusted_failure_count - ) - website_analysis.http_ground_truth_body_length = ( - http_analysis.ground_truth_body_length - ) - website_analysis.http_fp_name = http_analysis.fp_name - website_analysis.http_fp_scope = http_analysis.fp_scope - website_analysis.http_is_http_fp_match = http_analysis.is_http_fp_match - 
website_analysis.http_is_http_fp_country_consistent = ( - http_analysis.is_http_fp_country_consistent - ) - website_analysis.http_is_http_fp_false_positive = ( - http_analysis.is_http_fp_false_positive - ) - - yield website_analysis + # TODO(art): this is currently a pretty sub-optimal workaround to the whole + # database class needing to be refactored + return db._execute(INSERT_SQL, params=q_params) diff --git a/oonipipeline/src/oonipipeline/analysis/website_experiment_results.py b/oonipipeline/src/oonipipeline/analysis/website_experiment_results.py deleted file mode 100644 index 437f6333..00000000 --- a/oonipipeline/src/oonipipeline/analysis/website_experiment_results.py +++ /dev/null @@ -1,1026 +0,0 @@ -from dataclasses import dataclass -from datetime import datetime, timezone -import logging -from typing import Dict, Generator, List, Optional, Tuple - -from oonidata.models.analysis import WebAnalysis -from oonidata.models.experiment_result import MeasurementExperimentResult - -log = logging.getLogger("oonidata.analysis") - - -def map_analysis_to_target_name(analysis): - # Poormans mapping to target name - # TODO(arturo) we eventually want to look these up in some sort of database that groups together related domains - return analysis.target_domain_name.lstrip("www.") - - -NXDOMAIN_FAILURES = ["android_dns_cache_no_data", "dns_nxdomain_error"] - - -@dataclass -class OutcomeStatus: - key: str - value: float - scope: Optional[str] = None - - -@dataclass -class OutcomeSpace: - dns: Optional[OutcomeStatus] = None - tcp: Optional[OutcomeStatus] = None - tls: Optional[OutcomeStatus] = None - http: Optional[OutcomeStatus] = None - https: Optional[OutcomeStatus] = None - - def to_dict(self) -> Dict[str, float]: - d = {} - if self.dns: - d[self.dns.key] = self.dns.value - if self.tcp: - d[self.tcp.key] = self.tcp.value - if self.tls: - d[self.tls.key] = self.tls.value - if self.http: - d[self.http.key] = self.http.value - if self.https: - d[self.https.key] = self.https.value - return d - - def sum(self) -> float: - return sum([v for v in self.to_dict().values()]) - - def max(self) -> float: - return max([v for v in self.to_dict().values()]) - - def min(self) -> float: - return min([v for v in self.to_dict().values()]) - - -@dataclass -class LoNI: - ok_final: float - - ok: OutcomeSpace - down: OutcomeSpace - blocked: OutcomeSpace - blocking_scope: Optional[str] - - def to_dict(self): - return { - "ok": self.ok.to_dict(), - "down": self.down.to_dict(), - "blocked": self.blocked.to_dict(), - "blocking_scope": self.blocking_scope, - "ok_final": self.ok_final, - } - - -def calculate_web_loni( - web_analysis: WebAnalysis, -) -> Tuple[LoNI, List[str]]: - ok_value = 0 - ok = OutcomeSpace() - down = OutcomeSpace() - blocked = OutcomeSpace() - - # TODO(arturo): make this not nullable - blocking_scope = None - analysis_transcript = [] - - # We start off not knowing anything. - # So basically without any additional information we may as well be rolling - # a 3 sided dice. - # Yes, you can make a 3 sided dice: https://upload.wikimedia.org/wikipedia/commons/3/3b/04ds3.JPG - blocked_key, down_key = None, None - ok_value, down_value, blocked_value = 0.33, 0.33, 0.33 - # We are in the case of a DNS query failing, i.e. we got no answer. 
- if web_analysis.dns_consistency_system_failure is not None: - """ - Relevant keys for this section of the analysis: - - web_analysis.dns_ground_truth_failure_cc_asn_count - web_analysis.dns_ground_truth_failure_count - web_analysis.dns_ground_truth_nxdomain_count - web_analysis.dns_ground_truth_nxdomain_cc_asn_count - web_analysis.dns_ground_truth_ok_count - web_analysis.dns_ground_truth_ok_cc_asn_count - """ - # For sure, no matter what, the target is having some issue. - ok_value = 0.0 - # We now need to figure out if the failure is because of the target - # being down or if it's blocked. - blocked_key, down_key = "dns", "dns" - blocked_value, down_value = 0.5, 0.5 - dns_ground_truth_failure_count = ( - web_analysis.dns_ground_truth_failure_count or 0 - ) - dns_ground_truth_ok_count = web_analysis.dns_ground_truth_ok_count or 0 - dns_ground_truth_failure_cc_asn_count = ( - web_analysis.dns_ground_truth_failure_cc_asn_count or 0 - ) - dns_ground_truth_ok_cc_asn_count = ( - web_analysis.dns_ground_truth_ok_cc_asn_count or 0 - ) - - # Without any additional information, it could be 50/50 - if web_analysis.dns_consistency_system_failure in NXDOMAIN_FAILURES: - # NXDOMAIN errors are more commonly associated with censorship, let's bump up the blocked_value - blocked_key, down_key = "dns.nxdomain", "dns.nxdomain" - blocked_value = 0.6 - down_value = 1 - blocked_value - analysis_transcript.append( - "web_analysis.dns_consistency_system_failure in NXDOMAIN_FAILURES" - ) - if dns_ground_truth_failure_count > dns_ground_truth_ok_count: - # It's failing more than it's succeeding. This smells like an unreliable site. - blocked_value = 0.3 - down_value = 1 - blocked_value - analysis_transcript.append( - "dns_ground_truth_failure_count > dns_ground_truth_ok_count" - ) - if ( - dns_ground_truth_failure_cc_asn_count - > dns_ground_truth_ok_cc_asn_count - ): - # Even more if that is happening globally - blocked_value = 0.2 - down_value = 1 - blocked_value - analysis_transcript.append( - "dns_ground_truth_failure_cc_asn_count > dns_ground_truth_ok_cc_asn_count" - ) - elif ( - dns_ground_truth_ok_count > 0 - and web_analysis.dns_ground_truth_nxdomain_count == 0 - and web_analysis.dns_ground_truth_nxdomain_cc_asn_count == 0 - ): - # If we never saw a single NXDOMAIN in our ground truth, then - # it's really fishy. Let's bump up the blocking reason. - # TODO(arturo): when we introduce web_obs based ground truthing, - # we should use a threshold here based on the number of metrics - analysis_transcript.append( - "dns_ground_truth_ok_count > 0 and web_analysis.dns_ground_truth_nxdomain_count == 0 and web_analysis.dns_ground_truth_nxdomain_cc_asn_count == 0" - ) - blocked_value = 0.75 - down_value = 1 - blocked_value - else: - analysis_transcript.append( - "web_analysis.dns_consistency_system_failure not in NXDOMAIN_FAILURES" - ) - if dns_ground_truth_failure_count > dns_ground_truth_ok_count: - analysis_transcript.append( - "dns_ground_truth_failure_count > dns_ground_truth_ok_count" - ) - # it's failing more than it's succeeding, more likely to be blocked. - blocked_key, down_key = "dns.failure", "dns.failure" - blocked_value = 0.6 - down_value = 1 - blocked_value - - elif len(web_analysis.dns_consistency_system_answers) > 0: - analysis_transcript.append( - "len(web_analysis.dns_consistency_system_answers) > 0" - ) - # Ok we got some answers. Now we need to figure out if what we got is a - # good answer. 
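An aside on the DNS-failure branch above: the NXDOMAIN handling can be read as shifting probability mass between "blocked" and "down" while keeping their sum at 1. A minimal standalone sketch of that weighing, with plain integer arguments standing in for the dns_ground_truth_* fields (the helper name is made up):

def weigh_nxdomain(failure_count, ok_count,
                   failure_cc_asn_count, ok_cc_asn_count,
                   nxdomain_count, nxdomain_cc_asn_count):
    # NXDOMAIN is more commonly associated with censorship: start at 0.6 blocked.
    blocked = 0.6
    if failure_count > ok_count:
        # The ground truth fails more than it succeeds: likely an unreliable site.
        blocked = 0.3
        if failure_cc_asn_count > ok_cc_asn_count:
            # Failing across many networks too: even more likely just down.
            blocked = 0.2
    elif ok_count > 0 and nxdomain_count == 0 and nxdomain_cc_asn_count == 0:
        # The ground truth never saw a single NXDOMAIN: very fishy, bump blocking.
        blocked = 0.75
    return blocked, 1 - blocked  # (blocked, down) split whatever is not "ok"

assert weigh_nxdomain(0, 10, 0, 5, 0, 0) == (0.75, 0.25)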
- blocked_key = "dns" - down_key = "dns" - blocked_value = 0.5 - ok_value = 0.5 - # No matter what happens, it's not gonna be flagged as down - down_value = 0 - if web_analysis.dns_consistency_system_is_answer_tls_consistent == True: - # "easy" case: we got a TLS consistent answer we can flag it as good - # and move on with our business. - # - # XXX(arturo): there is an important caveat here. We have seen - # cases where you get a surely wrong answer via DNS (eg. a bogon), - # but for some reason you are then still establishing a good TLS - # handshake. Technically it's probably OK to mark these as unblocked - # since eventually you do get the content, but it's worth pointing - # it out. - # Here is a sample measurement for this case: https://explorer.ooni.org/m/20230101013014.694846_AE_webconnectivity_f9f1078ce75936a1 - analysis_transcript.append( - "web_analysis.dns_consistency_system_is_answer_tls_consistent == True" - ) - blocked_value = 0 - down_value = 0 - ok_value = 1 - elif ( - web_analysis.dns_consistency_system_is_answer_fp_match == True - and web_analysis.dns_consistency_system_is_answer_fp_false_positive == False - ): - # TODO(arturo): will we eventually have false positives in the DNS? If so how do we handle them? - # We matched a signature known to be used to implemented censorship. We can mark this as confirmed blocked. - analysis_transcript.append( - "web_analysis.dns_consistency_system_is_answer_fp_match == True and web_analysis.dns_consistency_system_is_answer_fp_false_positive == False" - ) - blocked_key = "dns.confirmed" - blocking_scope = web_analysis.dns_consistency_system_answer_fp_scope - blocked_value = 0.9 - down_value = 0.0 - if ( - web_analysis.dns_consistency_system_is_answer_fp_country_consistent - == True - ): - blocked_key = "dns.confirmed.country_consistent" - blocked_value = 1.0 - down_value = 0.0 - elif ( - web_analysis.dns_consistency_system_is_answer_fp_country_consistent - == False - ): - # If the fingerprint is not country consistent, we consider it down to avoid false positives - blocked_key = "dns.confirmed.not_country_consistent" - down_value = 0.8 - blocked_value = 0.2 - ok_value = 0.0 - elif web_analysis.dns_consistency_system_is_answer_bogon == True: - # Bogons are always fishy, yet we don't know if we see it because - # the site is misconfigured. - # In any case a bogon is not a routable IP, so the target is either - # down or blocked. - analysis_transcript.append( - "web_analysis.dns_consistency_system_is_answer_bogon == True" - ) - blocked_key, down_key = "dns.bogon", "dns.bogon" - blocked_value = 0.5 - down_value = 0.5 - ok_value = 0 - if ( - web_analysis.dns_consistency_system_is_answer_ip_in_trusted_answers - == False - ): - # If we didn't see the bogon in the trusted answers, then it's probably censorship - analysis_transcript.append( - "web_analysis.dns_consistency_system_is_answer_ip_in_trusted_answers == False" - ) - blocked_value = 0.8 - down_value = 1 - blocked_value - ok_value = 0 - elif ( - web_analysis.dns_consistency_system_is_answer_ip_in_trusted_answers - == True - ): - analysis_transcript.append( - "web_analysis.dns_consistency_system_is_answer_ip_in_trusted_answers == True" - ) - # If we did see it in the trusted answers, then we should actually ignore this case and mark the target as down. - blocked_value = 0.2 - down_value = 1 - blocked_value - ok_value = 0 - elif ( - web_analysis.dns_consistency_system_is_answer_ip_in_trusted_answers == True - ): - # Direct hit of the IP in the trusted answers. We nothing to see here. 
- analysis_transcript.append( - "web_analysis.dns_consistency_system_is_answer_ip_in_trusted_answers == True" - ) - blocked_value = 0.1 - ok_value = 1 - blocked_value - down_value = 0 - elif ( - web_analysis.dns_consistency_system_is_answer_asn_in_trusted_answers == True - or web_analysis.dns_consistency_system_is_answer_asorg_in_trusted_answers - == True - ): - # The ASN or AS org name of the observation matches that of our control. Most likely this is not a case of blocking. - analysis_transcript.append( - "web_analysis.dns_consistency_system_is_answer_asn_in_trusted_answers == True" - ) - blocked_value = 0.2 - ok_value = 1 - blocked_value - down_value = 0 - if web_analysis.dns_consistency_other_is_answer_cloud_provider == True: - # We are even more confident about it not being blocked if it's a cloud provider - analysis_transcript.append( - "web_analysis.dns_consistency_other_is_answer_cloud_provider == True" - ) - blocked_value = 0.1 - ok_value = 1 - blocked_value - down_value = 0 - else: - # We are done with all the simpler cases. We can now move into the - # more sketchy dubious analysis strategies. - # We assume that if we weren't able to determine consistency through - # the several previous methods, we will air on the side of saying - # it's blocked, but marginally. - analysis_transcript.append("not a_simple_case") - blocked_value = 0.6 - ok_value = 1 - blocked_value - down_value = 0 - # TODO(arturo): if we ever add false positive fingerprints to DNS we - # should add case for them here. - if web_analysis.dns_consistency_system_is_answer_probe_cc_match == True: - analysis_transcript.append( - "web_analysis.dns_consistency_system_is_answer_probe_cc_match == True" - ) - # It's common for blockpages to be hosted in the country of where the blocking is happening, let's bump up the blocking score. - blocked_key = "dns.inconsistent" - blocked_value = 0.65 - ok_value = 1 - blocked_value - if ( - web_analysis.dns_consistency_system_is_answer_cloud_provider - == False - ): - # If it's not a cloud provider, even more a reason to believe that's the case. - # TODO(arturo): add a new metric which tells us if the - # target domain is being hosted on a cloud provider and use - # that instead since this metric here will actually never be set to false - analysis_transcript.append( - "web_analysis.dns_consistency_system_is_answer_cloud_provider == False" - ) - blocked_value = 0.75 - ok_value = 1 - blocked_value - elif ( - web_analysis.dns_consistency_system_is_answer_cloud_provider == True - ): - # If it's a cloud provider, this is probably a false positive. - analysis_transcript.append( - "web_analysis.dns_consistency_system_is_answer_cloud_provider == True" - ) - blocked_key = "dns.cloud" - blocked_value = 0.3 - ok_value = 1 - blocked_value - elif ( - web_analysis.dns_consistency_system_is_answer_probe_asn_match - == True - ): - # It's not a cloud provider, but it's in the same network. Somethings up. - analysis_transcript.append( - "web_analysis.dns_consistency_system_is_answer_probe_asn_match == True" - ) - blocked_value = 0.7 - ok_value = 1 - blocked_value - - if blocked_key and down_key: - # We have finished the DNS analysis. We can store the - # final analysis to blocked and down dictionaries. 
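The storage step that follows maintains the invariant this whole function relies on: at every stage the blocked, down and ok masses add up to 1, which is what the recurring asserts check. A tiny self-contained illustration, with OutcomeStatus redeclared so the snippet runs on its own:

from dataclasses import dataclass
from typing import Optional

@dataclass
class OutcomeStatus:
    key: str
    value: float
    scope: Optional[str] = None

# A DNS failure that the heuristics lean towards calling blocked:
blocked_dns = OutcomeStatus(key="dns.nxdomain", value=0.6)
down_dns = OutcomeStatus(key="dns.nxdomain", value=0.4)
ok_value = 1 - (blocked_dns.value + down_dns.value)  # 0.0: nothing left for "ok"

# The same check the original code performs after every protocol stage.
assert round(blocked_dns.value + down_dns.value + ok_value) == 1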
- blocked.dns = OutcomeStatus( - key=blocked_key, value=blocked_value, scope=blocking_scope - ) - down.dns = OutcomeStatus(key=down_key, value=down_value) - ok.dns = OutcomeStatus(key="dns", value=ok_value) - assert ( - round(blocked.sum() + down.sum() + ok_value) == 1 - ), f"{blocked} + {down} + {ok_value} != 1" - if ok_value < 0.5: - # If the DNS analysis is leading us to believe the target is more down - # or blocked, than OK, we better off just call it day and return early. - # If we don't know if the answer we got was DNS consistent, we can't - # really trust the TCP and TLS analysis results. - # TODO(arturo): do we want to have different thresholds here? - analysis_transcript.append(f"ok_value < 0.5 # OK is {ok_value} after DNS") - return ( - LoNI( - ok_final=ok_value, - ok=ok, - blocked=blocked, - down=down, - blocking_scope=blocking_scope, - ), - analysis_transcript, - ) - - # TODO(arturo): convert paper notes into proof to go in here - if web_analysis.tcp_success == True: - # We succeeded via TCP, no matter what there are no TCP level issues - blocked_key, down_key = "tcp", "tcp" - down_value, blocked_value = 0.0, 0.0 - blocked.tcp = OutcomeStatus(key=blocked_key, value=blocked_value) - down.tcp = OutcomeStatus(key=down_key, value=down_value) - ok.tcp = OutcomeStatus(key="tcp", value=1 - (blocked.sum() + down.sum())) - - elif web_analysis.tcp_success == False: - analysis_transcript.append("web_analysis.tcp_success == False") - # No matter what the target is - blocked_key, down_key = "tcp.failure", "tcp.failure" - - down_value, blocked_value = 0.5, 0.5 - tcp_ground_truth_failure_count = ( - web_analysis.tcp_ground_truth_trusted_failure_count or 0 - ) - # TODO(arturo): Here we are only using the trusted ground truths (i.e. the control measurements) - # eventually we want to switch to using other OONI measurements too. - tcp_ground_truth_ok_count = web_analysis.tcp_ground_truth_trusted_ok_count or 0 - tcp_ground_truth_failure_asn_cc_count = ( - web_analysis.tcp_ground_truth_failure_asn_cc_count or 0 - ) - tcp_ground_truth_ok_asn_cc_count = ( - web_analysis.tcp_ground_truth_ok_asn_cc_count or 0 - ) - if tcp_ground_truth_failure_count > tcp_ground_truth_ok_count: - analysis_transcript.append( - "tcp_ground_truth_failure_count > tcp_ground_truth_ok_count" - ) - # It's failing more than it's succeeding. Probably the site is unreliable - blocked_value = 0.3 - down_value = 1 - blocked_value - if tcp_ground_truth_failure_asn_cc_count > tcp_ground_truth_ok_asn_cc_count: - analysis_transcript.append( - "tcp_ground_truth_failure_asn_cc_count > tcp_ground_truth_ok_asn_cc_count" - ) - - # Even more if it's happening globally - blocked_value = 0.2 - down_value = 1 - blocked_value - elif tcp_ground_truth_ok_count > tcp_ground_truth_failure_count: - analysis_transcript.append( - "tcp_ground_truth_ok_count > tcp_ground_truth_failure_count" - ) - # OTOH, if it's mostly working, then this is a sign of blocking - blocked_value = 0.7 - down_value = 1 - blocked_value - if web_analysis.tcp_failure == "connection_reset": - analysis_transcript.append( - 'web_analysis.tcp_failure == "connection_reset"' - ) - # Connection reset is very fishy. Let's bump up the blocking value. - blocked_value = 0.8 - down_value = 1 - blocked_value - elif web_analysis.tcp_failure == "connection_reset": - analysis_transcript.append('web_analysis.tcp_failure == "connection_reset"') - # Connection reset is very fishy. Let's bump up the blocking value. 
- blocked_value = 0.7 - down_value = 1 - blocked_value - - # Let's set some nice blocking keys - if web_analysis.tcp_failure in ["generic_timeout_error", "timed_out"]: - blocked_key, down_key = "tcp.timeout", "tcp.timeout" - elif web_analysis.tcp_failure == "connection_reset": - blocked_key, down_key = "tcp.connection_reset", "tcp.connection_reset" - else: - blocked_key = f"{blocked_key}.{web_analysis.tcp_failure}" - down_key = f"{down_key}.{web_analysis.tcp_failure}" - - blocked.tcp = OutcomeStatus(key=blocked_key, value=blocked_value * ok_value) - down.tcp = OutcomeStatus(key=down_key, value=down_value * ok_value) - # TODO(arturo): double check this is correct - ok.tcp = OutcomeStatus(key="tcp", value=1 - (blocked.sum() + down.sum())) - - if blocked_key and down_key: - old_ok_value = ok_value - ok_value = 1 - (blocked.sum() + down.sum()) - assert ( - round(blocked.sum() + down.sum() + ok_value) == 1 - ), f"{blocked} + {down} + {ok_value} != 1" - - if ok_value < 0.5: - # If the TCP analysis is leading us to believe the target is more down - # or blocked, than OK, we better off just call it day and return early. - # TODO(arturo): How should we map multiple failure types? This is OK for - # web 0.4, but doesn't apply to wc 0.5 - analysis_transcript.append( - f"ok_value < 0.5 # OK went after TCP from {old_ok_value} -> {ok_value}" - ) - return ( - LoNI( - ok_final=ok_value, - ok=ok, - blocked=blocked, - down=down, - blocking_scope=blocking_scope, - ), - analysis_transcript, - ) - - if web_analysis.tls_success == True: - blocked_key, down_key = "tls", "tls" - down_value, blocked_value = 0.0, 0.0 - blocked.tls = OutcomeStatus(key=blocked_key, value=blocked_value) - down.tls = OutcomeStatus(key=down_key, value=down_value) - - elif web_analysis.tls_success == False: - analysis_transcript.append("web_analysis.tls_success == False") - # No matter what we are in a tls failure case - blocked_key, down_key = "tls.failure", "tls.failure" - - down_value, blocked_value = 0.5, 0.5 - - # TODO(arturo): Here we are only using the trusted ground truths (i.e. - # the control measurements) eventually we want to switch to using other - # OONI measurements too. - tls_ground_truth_failure_count = ( - web_analysis.tls_ground_truth_trusted_failure_count or 0 - ) - tls_ground_truth_ok_count = web_analysis.tls_ground_truth_trusted_ok_count or 0 - tls_ground_truth_failure_asn_cc_count = ( - web_analysis.tls_ground_truth_failure_asn_cc_count or 0 - ) - tls_ground_truth_ok_asn_cc_count = ( - web_analysis.tls_ground_truth_ok_asn_cc_count or 0 - ) - if tls_ground_truth_failure_count > tls_ground_truth_ok_count: - analysis_transcript.append( - "tls_ground_truth_failure_count > tls_ground_truth_ok_count" - ) - # It's failing more than it's succeeding. 
Probably the site is unreliable - blocked_value = 0.3 - down_value = 1 - blocked_value - if tls_ground_truth_failure_asn_cc_count > tls_ground_truth_ok_asn_cc_count: - analysis_transcript.append( - "tls_ground_truth_failure_asn_cc_count > tls_ground_truth_ok_asn_cc_count" - ) - # Even more if it's happening globally - blocked_value = 0.2 - down_value = 1 - blocked_value - elif tls_ground_truth_ok_count > tls_ground_truth_failure_count: - analysis_transcript.append( - "tls_ground_truth_ok_count > tls_ground_truth_failure_count" - ) - # OTOH, if it's mostly working, then this is a sign of blocking - blocked_value = 0.7 - down_value = 1 - blocked_value - if web_analysis.tls_is_tls_certificate_invalid == True: - analysis_transcript.append( - "web_analysis.tls_is_tls_certificate_invalid == True" - ) - # bad certificate is very fishy. Let's bump up the blocking value. - blocked_value = 0.9 - down_value = 1 - blocked_value - elif web_analysis.tls_failure == "connection_reset": - # bad certificate is very fishy. Let's bump up the blocking value. - analysis_transcript.append( - "web_analysis.tls_failure == 'connection_reset'" - ) - blocked_value = 0.8 - down_value = 1 - blocked_value - - elif web_analysis.tls_is_tls_certificate_invalid == True: - analysis_transcript.append( - "web_analysis.tls_is_tls_certificate_invalid == True" - ) - # bad certificate is very fishy. Let's bump up the blocking value. - blocked_value = 0.8 - down_value = 1 - blocked_value - elif web_analysis.tls_failure == "connection_reset": - # connection_reset very fishy. Let's bump up the blocking value. - analysis_transcript.append("web_analysis.tls_failure == 'connection_reset'") - blocked_value = 0.7 - down_value = 1 - blocked_value - - # Let's set some nice blocking keys - if web_analysis.tls_failure in ["generic_timeout_error", "timed_out"]: - blocked_key, down_key = "tls.timeout", "tls.timeout" - elif web_analysis.tls_failure == "connection_reset": - blocked_key, down_key = "tls.connection_reset", "tls.connection_reset" - else: - blocked_key = f"{blocked_key}.{web_analysis.tls_failure}" - down_key = f"{down_key}.{web_analysis.tls_failure}" - - blocked.tls = OutcomeStatus(key=blocked_key, value=blocked_value * ok_value) - down.tls = OutcomeStatus(key=down_key, value=down_value * ok_value) - # TODO(arturo): double check this is correct - ok.tls = OutcomeStatus(key="tls", value=1 - (blocked.sum() + down.sum())) - - if blocked_key and down_key: - old_ok_value = ok_value - ok_value = 1 - (blocked.sum() + down.sum()) - assert ( - round(blocked.sum() + down.sum() + ok_value) - ) == 1, f"{blocked} + {down} + {ok_value} != 1" - - if ok_value < 0.5: - # If the TLS analysis is leading us to believe the target is more down - # or blocked, than OK, we better off just call it day and return early. - analysis_transcript.append( - f"ok_value < 0.5 # OK went after TLS from {old_ok_value} -> {ok_value}" - ) - return ( - LoNI( - ok_final=ok_value, - ok=ok, - blocked=blocked, - down=down, - blocking_scope=blocking_scope, - ), - analysis_transcript, - ) - - if web_analysis.http_is_http_request_encrypted is not None: - # If the connection is encrypted we will map these to TLS failures, - # since they are equivalent to the TLS level anomalies. - prefix = "http" - if web_analysis.http_is_http_request_encrypted == True: - prefix = "tls" - - # This is the special case to handle the situation where the HTTP - # analysis happens on it's own. 
Our prior is set to 1.0 - # TODO(arturo): add more details on why this works - if not blocked_key and not down_key: - ok_value = 1.0 - - blocked_key, down_key = prefix, prefix - - if ( - web_analysis.http_is_http_request_encrypted == True - and web_analysis.http_success == True - ): - analysis_transcript.append( - "web_analysis.http_is_http_request_encrypted == True and web_analysis.http_success == True" - ) - down_value, blocked_value = 0.0, 0.0 - - elif ( - web_analysis.http_is_http_request_encrypted == False - and web_analysis.http_success == True - ): - down_value = 0.0 - # We got an answer via HTTP, yet we don't know if the answer is correct. - analysis_transcript.append( - "web_analysis.http_is_http_request_encrypted == False and web_analysis.http_success == True" - ) - if web_analysis.http_is_http_fp_match == True: - # It matches a known fingerprint, we can say stuff - analysis_transcript.append("web_analysis.http_is_http_fp_match == True") - if web_analysis.http_is_http_fp_false_positive == False: - # We matched a signature known to be used to implemented censorship. We can mark this as confirmed blocked. - analysis_transcript.append( - "web_analysis.http_is_http_fp_false_positive == False" - ) - blocked_key = "http.confirmed" - blocking_scope = web_analysis.http_fp_scope - blocked_value = 0.9 - if web_analysis.http_is_http_fp_country_consistent == True: - analysis_transcript.append( - "web_analysis.http_is_http_fp_country_consistent == True" - ) - blocked_key = "http.confirmed.country_consistent" - blocked_value = 1.0 - elif web_analysis.http_is_http_fp_country_consistent == False: - # We let the blocked value be slightly less for cases where the fingerprint is not country consistent - analysis_transcript.append( - "web_analysis.dns_consistency_system_is_answer_fp_country_consistent == False" - ) - blocked_key = "http.confirmed.not_country_consistent" - blocked_value = 0.8 - elif web_analysis.http_is_http_fp_false_positive == True: - blocked_value = 0.0 - elif ( - web_analysis.http_response_body_length is not None - and web_analysis.http_ground_truth_body_length is not None - ): - # We need to apply some fuzzy logic to fingerprint it - # TODO(arturo): in the future can use more features, such as the following - """ - web_analysis.http_response_status_code - web_analysis.http_response_body_proportion - web_analysis.http_response_body_length - web_analysis.http_ground_truth_body_length - """ - http_response_body_length = web_analysis.http_response_body_length or 0 - http_ground_truth_body_length = ( - web_analysis.http_ground_truth_body_length or 0 - ) - body_proportion = (http_response_body_length + 1) / ( - http_ground_truth_body_length + 1 - ) - if body_proportion < 0.7: - analysis_transcript.append( - "(http_response_body_length + 1)/ (http_ground_truth_body_length + 1) < 0.7" - ) - blocked_key = "http.inconsistent.body_length_mismatch" - blocked_value = 0.7 - # TODO(arturo): check if this indeed has the desired effect. - down_value = 0 - - elif web_analysis.http_failure: - analysis_transcript.append(f"web_analysis.http_failure # ok: {ok_value}") - # No matter what we are in a failure case - - blocked_key, down_key = f"{prefix}.failure", f"{prefix}.failure" - down_value, blocked_value = 0.5, 0.5 - - # TODO(arturo): Here we are only using the trusted ground truths (i.e. - # the control measurements) eventually we want to switch to using other - # OONI measurements too. 
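On the body-length comparison earlier in this branch: plaintext HTTP gives no cryptographic way to validate the response, so the heuristic compares the fetched body size against the control's, computing (length + 1) / (control_length + 1) so that empty bodies never divide by zero. A small sketch of that check (same 0.7 threshold as above; the helper name is made up):

def body_looks_truncated(body_length: int, control_body_length: int,
                         threshold: float = 0.7) -> bool:
    # +1 on both sides keeps the ratio defined when either body is empty.
    proportion = (body_length + 1) / (control_body_length + 1)
    return proportion < threshold

print(body_looks_truncated(512, 48_000))     # True: far smaller than the control, possible blockpage
print(body_looks_truncated(47_000, 48_000))  # False: roughly the expected size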
- https_ground_truth_failure_count = ( - web_analysis.http_ground_truth_trusted_failure_count or 0 - ) - https_ground_truth_ok_count = ( - web_analysis.http_ground_truth_trusted_ok_count or 0 - ) - https_ground_truth_failure_asn_cc_count = ( - web_analysis.http_ground_truth_failure_asn_cc_count or 0 - ) - https_ground_truth_ok_asn_cc_count = ( - web_analysis.http_ground_truth_ok_asn_cc_count or 0 - ) - if https_ground_truth_failure_count > https_ground_truth_ok_count: - analysis_transcript.append( - "https_ground_truth_failure_count > https_ground_truth_ok_count" - ) - # It's failing more than it's succeeding. Probably the site is unreliable - blocked_value = 0.3 - down_value = 0.7 - if ( - https_ground_truth_failure_asn_cc_count - > https_ground_truth_ok_asn_cc_count - ): - analysis_transcript.append( - "https_ground_truth_failure_asn_cc_count > https_ground_truth_ok_asn_cc_count" - ) - # Even more if it's happening globally - blocked_value = 0.2 - down_value = 0.8 - elif https_ground_truth_ok_count > https_ground_truth_failure_count: - analysis_transcript.append( - "https_ground_truth_ok_count > https_ground_truth_failure_count" - ) - # OTOH, if it's mostly working, then this is a sign of blocking - blocked_value = 0.7 - down_value = 0.3 - if "ssl_" in web_analysis.http_failure: - analysis_transcript.append('"ssl_" in web_analysis.http_failure') - # bad certificate is very fishy. Let's bump up the blocking value. - blocked_value = 0.9 - down_value = 0.1 - elif web_analysis.http_failure == "connection_reset": - # connection reset is very fishy. Let's bump up the blocking value. - analysis_transcript.append( - 'web_analysis.http_failure == "connection_reset"' - ) - blocked_value = 0.8 - down_value = 0.2 - - elif web_analysis.http_failure == "connection_reset": - # connection_reset very fishy. Let's bump up the blocking value. 
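A note on the analysis_transcript list used throughout: whenever a heuristic fires, the condition is appended verbatim, so the final LoNI can later be traced back to the exact chain of rules that produced it. The pattern, reduced to its bones with a hypothetical condition:

analysis_transcript = []
http_failure = "connection_reset"
blocked_value = 0.5

if http_failure == "connection_reset":
    # Record the condition as text so the verdict stays explainable afterwards.
    analysis_transcript.append('http_failure == "connection_reset"')
    blocked_value = 0.8

print(analysis_transcript)  # ['http_failure == "connection_reset"']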
- analysis_transcript.append( - "web_analysis.http_failure == 'connection_reset'" - ) - blocked_value = 0.7 - down_value = 0.3 - - # Let's set some nice blocking keys - if web_analysis.http_failure in ["generic_timeout_error", "timed_out"]: - blocked_key, down_key = f"{prefix}.timeout", f"{prefix}.timeout" - elif web_analysis.http_failure == "connection_reset": - blocked_key, down_key = ( - f"{prefix}.connection_reset", - f"{prefix}.connection_reset", - ) - else: - blocked_key = f"{blocked_key}.{web_analysis.http_failure}" - down_key = f"{down_key}.{web_analysis.http_failure}" - - if prefix == "tls": - if blocked.tls is not None: - log.debug( - f"overwriting previous TLS blocking status {blocked.tls} - {down.tls} with " - f"blk_val={blocked_value} dwn_val={down_value} " - f"msmt_uid=({web_analysis.measurement_meta.measurement_uid})" - ) - blocked.tls = OutcomeStatus(key=blocked_key, value=blocked_value * ok_value) - down.tls = OutcomeStatus(key=down_key, value=down_value * ok_value) - # TODO(arturo): double check this is correct - ok.tls = OutcomeStatus(key="tls", value=1 - (blocked.sum() + down.sum())) - else: - blocked.http = OutcomeStatus( - key=blocked_key, value=blocked_value * ok_value, scope=blocking_scope - ) - down.http = OutcomeStatus(key=down_key, value=down_value * ok_value) - # TODO(arturo): double check this is correct - ok.http = OutcomeStatus(key="http", value=1 - (blocked.sum() + down.sum())) - - if blocked_key and down_key: - old_ok_value = ok_value - ok_value = 1 - (blocked.sum() + down.sum()) - assert ( - round(blocked.sum() + down.sum() + ok_value) == 1 - ), f"{blocked} + {down} + {ok_value} != 1" - - return ( - LoNI( - ok_final=ok_value, - ok=ok, - blocked=blocked, - down=down, - blocking_scope=blocking_scope, - ), - analysis_transcript, - ) - - -def make_website_experiment_results( - web_analysis: List[WebAnalysis], -) -> Generator[MeasurementExperimentResult, None, None]: - """ - Takes as input a list of web_analysis and outputs a list of - ExperimentResults for the website. - """ - observation_id_list = [] - first_analysis = web_analysis[0] - - measurement_uid = first_analysis.measurement_meta.measurement_uid - timeofday = first_analysis.measurement_meta.measurement_start_time - - target_nettest_group = "websites" - target_category = "MISC" - target_name = map_analysis_to_target_name(first_analysis) - target_domain_name = first_analysis.target_domain_name - target_detail = first_analysis.target_detail - - analysis_transcript_list = [] - loni_list: List[LoNI] = [] - loni_blocked_list: List[OutcomeSpace] = [] - loni_down_list: List[OutcomeSpace] = [] - loni_ok_list: List[OutcomeSpace] = [] - for wa in web_analysis: - loni, analysis_transcript = calculate_web_loni(wa) - log.debug("wa: %s", wa) - log.debug("analysis_transcript: %s", analysis_transcript) - log.debug("loni: %s", loni) - analysis_transcript_list.append(analysis_transcript) - loni_list.append(loni) - loni_blocked_list.append(loni.blocked) - loni_down_list.append(loni.down) - loni_ok_list.append(loni.ok) - - final_blocked = OutcomeSpace() - final_down = OutcomeSpace() - final_ok = OutcomeSpace() - ok_value = 0 - blocking_scope = None - - # TODO(arturo): this section needs to be formalized and verified a bit more - # in depth. Currently it's just a prototype to start seeing how the data - # looks like. 
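# A minimal sketch (illustrative, not part of the diff; "Status" is a hypothetical
# stand-in for OutcomeStatus) of the aggregation strategy used below: across the
# per-observation LoNIs the most pessimistic view wins, i.e. the maximum
# blocked/down outcome and the minimum ok outcome per category.
from dataclasses import dataclass

@dataclass
class Status:
    key: str
    value: float

dns_blocked = [Status("dns.inconsistent", 0.05), Status("dns.inconsistent", 0.5)]
dns_ok = [Status("dns", 0.9), Status("dns", 0.4)]

worst_blocked = max(dns_blocked, key=lambda s: s.value)  # -> value 0.5
worst_ok = min(dns_ok, key=lambda s: s.value)            # -> value 0.4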
- - def get_agg_outcome(loni_list, category, agg_func) -> Optional[OutcomeStatus]: - """ - Returns the min or max outcome status of the specified category given the loni list - """ - try: - return agg_func( - filter( - lambda x: x is not None, - map(lambda x: getattr(x, category), loni_list), - ), - key=lambda d: d.value if d else 0, - ) - except ValueError: - return None - - ### FINAL DNS - max_dns_blocked = get_agg_outcome(loni_blocked_list, "dns", max) - max_dns_down = get_agg_outcome(loni_down_list, "dns", max) - min_dns_ok = get_agg_outcome(loni_ok_list, "dns", min) - - if max_dns_blocked and max_dns_down and min_dns_ok: - ok_value = min_dns_ok.value - final_ok.dns = OutcomeStatus(key="dns", value=min_dns_ok.value) - final_blocked.dns = OutcomeStatus( - key=max_dns_blocked.key, value=max_dns_blocked.value - ) - final_down.dns = OutcomeStatus( - # TODO(arturo): this is overestimating blocking. - key=max_dns_down.key, - value=1 - (min_dns_ok.value + max_dns_blocked.value), - ) - if max_dns_blocked.scope: - # TODO(arturo): set this on the parent OutcomeStatus too - blocking_scope = max_dns_blocked.scope - log.debug(f"DNS done {ok_value}") - - ### FINAL TCP - max_tcp_blocked = get_agg_outcome(loni_blocked_list, "tcp", max) - max_tcp_down = get_agg_outcome(loni_down_list, "tcp", max) - min_tcp_ok = get_agg_outcome(loni_ok_list, "tcp", min) - if max_tcp_blocked and max_tcp_down and min_tcp_ok: - log.debug(f"PERFORMING TCP {ok_value}") - log.debug(f"max_tcp_blocked: {max_tcp_blocked}") - log.debug(f"max_tcp_down: {max_tcp_down}") - log.debug(f"min_tcp_ok: {min_tcp_ok}") - log.debug(f"final_down: {final_down}") - log.debug(f"final_blocked: {final_blocked}") - log.debug(f"final_ok: {final_ok}") - final_blocked.tcp = OutcomeStatus( - key=max_tcp_blocked.key, value=max_tcp_blocked.value * ok_value - ) - final_down.tcp = OutcomeStatus( - key=max_tcp_down.key, - value=(1 - (min_tcp_ok.value + max_tcp_blocked.value)) * ok_value, - ) - final_ok.tcp = OutcomeStatus(key="tcp", value=min_tcp_ok.value) - # TODO(arturo): should we update the DNS down key value in light of the - # fact we notice TCP is bad and hence the answer might have been bad to - # begin with? 
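# Worked numeric example of the renormalisation that follows (illustrative values
# only): if the DNS step left ok_value = 0.8 and the worst TCP observation has
# blocked = 0.5 relative to that, the absolute TCP blocked mass is 0.5 * 0.8 = 0.4;
# recomputing ok_value as 1 - (final_blocked.sum() + final_down.sum()) then keeps
# blocked + down + ok summing to 1 across the layers.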
- old_ok_value = ok_value - ok_value = 1 - (final_blocked.sum() + final_down.sum()) - log.debug(f"TCP done {old_ok_value} -> {ok_value}") - log.debug(f"final_down: {final_down}") - log.debug(f"final_blocked: {final_blocked}") - log.debug(f"final_ok: {final_ok}") - - ### FINAL TLS - max_tls_blocked = get_agg_outcome(loni_blocked_list, "tls", max) - max_tls_down = get_agg_outcome(loni_down_list, "tls", max) - min_tls_ok = get_agg_outcome(loni_ok_list, "tls", min) - if max_tls_blocked and max_tls_down and min_tls_ok: - final_blocked.tls = OutcomeStatus( - key=max_tls_blocked.key, value=max_tls_blocked.value * ok_value - ) - final_down.tls = OutcomeStatus( - key=max_tls_down.key, - value=(1 - (min_tls_ok.value + max_tls_blocked.value)) * ok_value, - ) - final_ok.tls = OutcomeStatus(key="tls", value=min_tls_ok.value) - old_ok_value = ok_value - ok_value = 1 - (final_blocked.sum() + final_down.sum()) - log.debug(f"TLS done {old_ok_value} -> {ok_value}") - log.debug(f"final_down: {final_down}") - log.debug(f"final_blocked: {final_blocked}") - log.debug(f"final_ok: {final_ok}") - - ### FINAL HTTP - max_http_blocked = get_agg_outcome(loni_blocked_list, "http", max) - max_http_down = get_agg_outcome(loni_down_list, "http", max) - min_http_ok = get_agg_outcome(loni_ok_list, "http", min) - - if max_http_blocked and max_http_down and min_http_ok: - final_blocked.http = OutcomeStatus( - key=max_http_blocked.key, value=max_http_blocked.value * ok_value - ) - final_down.http = OutcomeStatus( - key=max_http_down.key, - value=(1 - (min_http_ok.value + max_http_blocked.value)) * ok_value, - ) - final_ok.http = OutcomeStatus(key="http", value=min_http_ok.value) - if max_http_blocked.scope: - if blocking_scope is not None: - log.warning(f"overwriting blocking_scope key: {blocking_scope}") - # TODO(arturo): set this on the parent OutcomeStatus too - blocking_scope = max_http_blocked.scope - - old_ok_value = ok_value - ok_value = 1 - (final_blocked.sum() + final_down.sum()) - log.debug(f"HTTP done {old_ok_value} -> {ok_value}") - log.debug(f"final_down: {final_down}") - log.debug(f"final_blocked: {final_blocked}") - log.debug(f"final_ok: {final_ok}") - - final_loni = LoNI( - ok_final=ok_value, - ok=final_ok, - down=final_down, - blocked=final_blocked, - blocking_scope=blocking_scope, - ) - log.debug(f"final_loni: {final_loni}") - - loni_ok_value = final_ok.min() - - loni_down = final_loni.down.to_dict() - loni_down_keys, loni_down_values = list(loni_down.keys()), list(loni_down.values()) - - loni_blocked = final_loni.blocked.to_dict() - loni_blocked_keys, loni_blocked_values = list(loni_blocked.keys()), list( - loni_blocked.values() - ) - - loni_ok = final_loni.ok.to_dict() - loni_ok_keys, loni_ok_values = list(loni_ok.keys()), list(loni_ok.values()) - - is_anomaly = loni_ok_value < 0.6 - is_confirmed = final_loni.blocked.sum() > 0.9 - - er = MeasurementExperimentResult( - measurement_meta=first_analysis.measurement_meta, - probe_meta=first_analysis.probe_meta, - # Extra info - observation_id_list=observation_id_list, - timeofday=timeofday, - created_at=datetime.now(timezone.utc).replace(tzinfo=None), - # Location info - location_network_type=first_analysis.probe_meta.network_type, - location_network_asn=first_analysis.probe_meta.probe_asn, - location_network_cc=first_analysis.probe_meta.probe_cc, - location_network_as_org_name=first_analysis.probe_meta.probe_as_org_name, - location_network_as_cc=first_analysis.probe_meta.probe_as_cc, - location_resolver_asn=first_analysis.probe_meta.resolver_asn, - 
location_resolver_as_org_name=first_analysis.probe_meta.resolver_as_org_name, - location_resolver_as_cc=first_analysis.probe_meta.resolver_as_cc, - location_resolver_cc=first_analysis.probe_meta.resolver_cc, - location_blocking_scope=None, - # Target info - target_nettest_group=target_nettest_group, - target_category=target_category, - target_name=target_name, - target_domain_name=target_domain_name, - target_detail=target_detail, - loni_ok_value=loni_ok_value, - loni_down_keys=loni_down_keys, - loni_down_values=loni_down_values, - loni_blocked_keys=loni_blocked_keys, - loni_blocked_values=loni_blocked_values, - loni_ok_keys=loni_ok_keys, - loni_ok_values=loni_ok_values, - loni_list=list(map(lambda x: x.to_dict(), loni_list)), - analysis_transcript_list=analysis_transcript_list, - measurement_count=1, - observation_count=len(web_analysis), - vp_count=1, - anomaly=is_anomaly, - confirmed=is_confirmed, - ) - - yield er diff --git a/oonipipeline/src/oonipipeline/api/routers/aggregate_analysis.py b/oonipipeline/src/oonipipeline/api/routers/aggregate_analysis.py index 6aa79220..25fcb11a 100644 --- a/oonipipeline/src/oonipipeline/api/routers/aggregate_analysis.py +++ b/oonipipeline/src/oonipipeline/api/routers/aggregate_analysis.py @@ -9,10 +9,7 @@ from .utils import get_measurement_start_day_agg, TimeGrains from ..dependencies import ClickhouseClient, get_clickhouse_client from .list_analysis import ( - OONI_DATA_COLS_REMAP, - OONI_DATA_COLS_REMAP_INV, SinceUntil, - test_name_to_group, utc_30_days_ago, utc_today, ) @@ -40,27 +37,19 @@ class DBStats(BaseModel): class AggregationEntry(BaseModel): - anomaly_count: int - confirmed_count: int - failure_count: int - ok_count: int - measurement_count: int - - observation_count: int - vantage_point_count: int + anomaly_count: float + confirmed_count: float + failure_count: float + ok_count: float + measurement_count: float + measurement_start_day: date - loni_down_map: Dict[str, float] - loni_down_value: float - loni_blocked_map: Dict[str, float] - loni_blocked_value: float - # loni_ok_map: Dict[str, float] - loni_ok_value: float + outcome_label: str + outcome_value: float domain: Optional[str] = None probe_cc: Optional[str] = None probe_asn: Optional[int] = None - test_name: Optional[str] = None - class AggregationResponse(BaseModel): # TODO(arturo): these keys are inconsistent with the other APIs @@ -69,8 +58,8 @@ class AggregationResponse(BaseModel): result: List[AggregationEntry] -@router.get("/aggregation", tags=["aggregation"]) -async def get_aggregation( +@router.get("/aggregation/analysis", tags=["aggregation"]) +async def get_aggregation_analysis( db: Annotated[ClickhouseClient, Depends(get_clickhouse_client)], axis_x: Annotated[AggregationKeys, Query()] = "measurement_start_day", axis_y: Annotated[Optional[AggregationKeys], Query()] = None, @@ -84,7 +73,7 @@ async def get_aggregation( since: SinceUntil = utc_30_days_ago(), until: SinceUntil = utc_today(), time_grain: Annotated[TimeGrains, Query()] = "day", - anomaly_sensitivity: Annotated[float, Query()] = 0.7, + anomaly_sensitivity: Annotated[float, Query()] = 0.9, format: Annotated[Literal["JSON", "CSV"], Query()] = "JSON", download: Annotated[bool, Query()] = False, ) -> AggregationResponse: @@ -100,34 +89,34 @@ async def get_aggregation( f"{get_measurement_start_day_agg(time_grain)} as measurement_start_day" ) elif axis_x: - col = OONI_DATA_COLS_REMAP.get(axis_x) - extra_cols[axis_x] = f"{col} as {axis_x}" + extra_cols[axis_x] = axis_x if probe_asn is not None: if isinstance(probe_asn, 
str) and probe_asn.startswith("AS"): probe_asn = int(probe_asn[2:]) q_args["probe_asn"] = probe_asn - and_clauses.append("location_network_asn = %(probe_asn)d") - extra_cols["probe_asn"] = "location_network_asn as probe_asn" + and_clauses.append("probe_asn = %(probe_asn)d") + extra_cols["probe_asn"] = "probe_asn" if probe_cc is not None: q_args["probe_cc"] = probe_cc - and_clauses.append("location_network_cc = %(probe_cc)s") - extra_cols["probe_cc"] = "location_network_cc as probe_cc" + and_clauses.append("probe_cc = %(probe_cc)s") + extra_cols["probe_cc"] = "probe_cc" if test_name is not None: - q_args["test_name"] = test_name_to_group(test_name) - and_clauses.append("target_nettest_group = %(test_name)s") - extra_cols["test_name"] = "target_nettest_group as test_name" - if category_code is not None: - q_args["category_code"] = category_code - and_clauses.append("target_category_code = %(category_code)s") - extra_cols["category_code"] = "target_category_code as category_code" + q_args["test_name"] = test_name + and_clauses.append("test_name = %(test_name)s") + extra_cols["test_name"] = "test_name" + # if category_code is not None: + # q_args["category_code"] = category_code + # and_clauses.append("%(category_code)s") + # extra_cols["category_code"] = "category_code" if domain is not None: q_args["domain"] = domain - and_clauses.append("target_domain_name = %(domain)s") - extra_cols["domain"] = "target_domain_name as domain" + and_clauses.append("domain = %(domain)s") + extra_cols["domain"] = "domain" if input is not None: - # XXX - pass + q_args["input"] = input + and_clauses.append("input = %(input)s") + extra_cols["input"] = "input" if axis_y: dimension_count += 1 @@ -139,157 +128,166 @@ async def get_aggregation( f"{get_measurement_start_day_agg(time_grain)} as measurement_start_day" ) else: - col = OONI_DATA_COLS_REMAP_INV.get(axis_y) - extra_cols[axis_y] = f"{col} as {axis_y}" + extra_cols[axis_y] = axis_y if since is not None: q_args["since"] = since - and_clauses.append("timeofday >= %(since)s") + and_clauses.append("measurement_start_time >= %(since)s") if until is not None: - and_clauses.append("timeofday <= %(until)s") + and_clauses.append("measurement_start_time <= %(until)s") q_args["until"] = until - q_args["anomaly_sensitivity"] = anomaly_sensitivity - - """ - if anomaly is True: - and_clauses.append("arraySum(loni_blocked_values) > 0.5") - elif anomaly is False: - and_clauses.append("arraySum(loni_blocked_values) <= 0.5") - - if confirmed is True: - and_clauses.append("arraySum(loni_blocked_values) == 1.0") - - if failure is False: - # TODO(arturo): how do we map this onto failure? - pass - """ - where = "" if len(and_clauses) > 0: where += " WHERE " where += " AND ".join(and_clauses) - # TODO(arturo): the sort of this matters. We should be smarter. 
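# Illustrative example (parameter values are hypothetical, not from the diff): with
# query parameters probe_cc=IT and domain=example.com plus the default since/until
# window, the clause assembly above produces roughly
#   WHERE probe_cc = %(probe_cc)s AND domain = %(domain)s
#     AND measurement_start_time >= %(since)s AND measurement_start_time <= %(until)s
# with q_args = {"probe_cc": "IT", "domain": "example.com", "since": ..., "until": ...}
# and extra_cols carrying the GROUP BY columns (measurement_start_day plus any
# axis_y / filter columns).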
- base_cols = [ - "loni_down_map", - "loni_blocked_map", - "loni_ok_value", - "loni_down_value", - "loni_blocked_value", - "measurement_count", - "observation_count", - "vantage_point_count", - "confirmed_count", - "anomaly_count", - ] - q = f""" WITH - loni_blocked_weight_avg_map as loni_blocked_map, - loni_down_weight_avg_map as loni_down_map, - arraySum(mapValues(loni_blocked_map)) as loni_blocked_value_avg, - arraySum(mapValues(loni_down_map)) as loni_down_value_avg, - loni_ok_weight_avg_value as loni_ok_value_avg, + mapFilter((k, v) -> v != 0, dns_nok_outcomes) as dns_outcomes, + mapFilter((k, v) -> v != 0, tcp_nok_outcomes) as tcp_outcomes, + mapFilter((k, v) -> v != 0, tls_nok_outcomes) as tls_outcomes, + + arrayZip(mapKeys(dns_outcomes), mapValues(dns_outcomes)) as dns_outcome_list, + arraySum((v) -> v.2, dns_outcome_list) as dns_nok_sum, + arraySort((v) -> -v.2, arrayMap((v) -> (v.1, v.2/dns_nok_sum), dns_outcome_list)) as dns_outcomes_norm, + + arrayZip(mapKeys(tcp_outcomes), mapValues(tcp_outcomes)) as tcp_outcome_list, + arraySum((v) -> v.2, tcp_outcome_list) as tcp_nok_sum, + arraySort((v) -> -v.2, arrayMap((v) -> (v.1, v.2/tcp_nok_sum), tcp_outcome_list)) as tcp_outcomes_norm, + + arrayZip(mapKeys(tls_outcomes), mapValues(tls_outcomes)) as tls_outcome_list, + arraySum((v) -> v.2, tls_outcome_list) as tls_nok_sum, + arraySort((v) -> -v.2, arrayMap((v) -> (v.1, v.2/tls_nok_sum), tls_outcome_list)) as tls_outcomes_norm, + + arraySort( + (v) -> -v.2, + [ + (dns_outcome_nok_label, dns_outcome_nok_value), + (tcp_outcome_nok_label, tcp_outcome_nok_value), + (tls_outcome_nok_label, tls_outcome_nok_value), + IF( + tls_ok_sum = 0 AND tls_outcome_nok_value = 0, + -- Special case for when the tested target was not supporting HTTPS and hence the TLS outcome is not so relevant + ('ok', arrayMin([dns_outcome_ok_value, tcp_outcome_ok_value])), + ('ok', arrayMin([dns_outcome_ok_value, tcp_outcome_ok_value, tls_outcome_ok_value])) + ) + ] + ) as all_outcomes_sorted, - loni_ok_value_avg + loni_down_value_avg + loni_blocked_value_avg as loni_total + arrayConcat(dns_outcomes_norm, tcp_outcomes_norm, tls_outcomes_norm) as all_nok_outcomes, - SELECT + dns_outcomes_norm[1].1 as dns_outcome_nok_label, + dns_outcomes_norm[1].2 as dns_outcome_nok_value, + + tcp_outcomes_norm[1].1 as tcp_outcome_nok_label, + tcp_outcomes_norm[1].2 as tcp_outcome_nok_value, - loni_down_map, - loni_blocked_map, + tls_outcomes_norm[1].1 as tls_outcome_nok_label, + tls_outcomes_norm[1].2 as tls_outcome_nok_value, - -- TODO(arturo): this is a bit ghetto - loni_ok_value_avg / loni_total as loni_ok_value, - loni_down_value_avg / loni_total as loni_down_value, - loni_blocked_value_avg / loni_total as loni_blocked_value, + IF(dns_ok_sum > 0, 1 - dns_outcome_nok_value, 0) as dns_outcome_ok_value, + IF(tcp_ok_sum > 0, 1 - tcp_outcome_nok_value, 0) as tcp_outcome_ok_value, + IF(tls_ok_sum > 0, 1 - tls_outcome_nok_value, 0) as tls_outcome_ok_value, - measurement_count_agg as measurement_count, - observation_count_agg as observation_count, - vantage_point_count, + all_outcomes_sorted[1].1 as final_outcome_label, + IF(final_outcome_label = 'ok', all_outcomes_sorted[1].2, all_outcomes_sorted[1].2) as final_outcome_value - confirmed_count, - anomaly_count, + SELECT - -- Extra columns - {", ".join(extra_cols.keys())} + {",".join(extra_cols.keys())}, + probe_analysis, + all_nok_outcomes as all_outcomes, + final_outcome_label as outcome_label, + final_outcome_value as outcome_value FROM ( WITH - CAST((loni_down_keys, loni_down_values), 
'Map(String, Float64)') as loni_down_map, - CAST((loni_blocked_keys, loni_blocked_values), 'Map(String, Float64)') as loni_blocked_map - SELECT - - sumMap(loni_down_map) as loni_down_sum, - countMap(loni_down_map) as loni_down_cnt, - arraySum(mapValues(loni_down_cnt)) as loni_down_cnt_total, - arraySum(mapValues(loni_down_sum)) as loni_down_value_total, - mapApply( - (k, v) -> ( - k, - if( - loni_down_cnt_total == 0 or loni_down_value_total == 0, 0, - toFloat64(v) / toFloat64(loni_down_value_total) * toFloat64(loni_down_cnt[k])/toFloat64(loni_down_cnt_total) + IF(resolver_asn = probe_asn, 1, 0) as is_isp_resolver, + multiIf( + top_dns_failure IN ('android_dns_cache_no_data', 'dns_nxdomain_error'), + 'nxdomain', + coalesce(top_dns_failure, 'got_answer') + ) as dns_failure + SELECT + {",".join(extra_cols.values())}, + + anyHeavy(top_probe_analysis) as probe_analysis, + + sumMap( + map( + CONCAT(IF(is_isp_resolver, 'dns_isp.blocked.', 'dns_other.blocked.'), dns_failure), dns_blocked_max, + CONCAT(IF(is_isp_resolver, 'dns_isp.down.', 'dns_other.down.'), dns_failure), dns_down_max ) - ), - loni_down_sum - ) as loni_down_weight_avg_map, - - sumMap(loni_blocked_map) as loni_blocked_sum, - countMap(loni_blocked_map) as loni_blocked_cnt, - arraySum(mapValues(loni_blocked_cnt)) as loni_blocked_cnt_total, - arraySum(mapValues(loni_blocked_sum)) as loni_blocked_value_total, - mapApply( - (k, v) -> ( - k, - if( - loni_blocked_cnt_total == 0 or loni_blocked_value_total == 0, 0, - toFloat64(v) / toFloat64(loni_blocked_value_total) * toFloat64(loni_blocked_cnt[k]) / toFloat64(loni_blocked_cnt_total) + ) as dns_nok_outcomes, + sum(dns_ok_max) as dns_ok_sum, + + sumMap( + map( + CONCAT('tcp.blocked.', coalesce(top_tcp_failure, '')), tcp_blocked_max, + CONCAT('tcp.down.', coalesce(top_tcp_failure, '')), tcp_down_max ) - ), - loni_blocked_sum - ) as loni_blocked_weight_avg_map, - - sum(loni_ok_value) as loni_ok_total, - COUNT() as loni_ok_cnt, - loni_ok_total/loni_ok_cnt as loni_ok_weight_avg_value, - - SUM(measurement_count) as measurement_count_agg, - SUM(observation_count) as observation_count_agg, - COUNT(DISTINCT - location_network_type, - location_network_asn, - location_network_cc, - location_resolver_asn - ) as vantage_point_count, - - sumIf(measurement_count, arraySum(loni_blocked_values) == 1) as confirmed_count, - sumIf(measurement_count, arraySum(loni_blocked_values) >= %(anomaly_sensitivity)f) as anomaly_count, - - -- Extra columns - {", ".join(extra_cols.values())} - - FROM measurement_experiment_result + ) as tcp_nok_outcomes, + sum(tcp_ok_max) as tcp_ok_sum, + + sumMap( + map( + CONCAT('tls.blocked.', coalesce(top_tls_failure, '')), tls_blocked_max, + CONCAT('tls.down.', coalesce(top_tls_failure, '')), tls_down_max + ) + ) as tls_nok_outcomes, + sum(tls_ok_max) as tls_ok_sum + + FROM ooni.analysis_web_measurement {where} GROUP BY {", ".join(extra_cols.keys())} ORDER BY {", ".join(extra_cols.keys())} ) """ - cols = base_cols + list(extra_cols.keys()) t = PerfTimer() log.info(f"running query {q} with {q_args}") rows = db.execute(q, q_args) + fixed_cols = ["probe_analysis", "all_outcomes", "outcome_label", "outcome_value"] + results: List[AggregationEntry] = [] if rows and isinstance(rows, list): for row in rows: print(row) - d = dict(zip(cols, row)) - d["failure_count"] = 0 - d["ok_count"] = d["measurement_count"] - d["anomaly_count"] - log.debug(f"adding {d}") - results.append(AggregationEntry(**d)) + d = dict(zip(list(extra_cols.keys()) + fixed_cols, row)) + outcome_value = d["outcome_value"] 
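# The mapping applied below, summarised: an "ok" outcome feeds ok_count; a
# "blocked.*" outcome counts as confirmed when its value reaches the
# anomaly_sensitivity threshold (default 0.9) and as an anomaly otherwise; any
# remaining ("down.*") outcome is reported as failure_count.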
+ outcome_label = d["outcome_label"] + anomaly_count = 0 + confirmed_count = 0 + failure_count = 0 + ok_count = 0 + if outcome_label == "ok": + ok_count = outcome_value + elif "blocked." in outcome_label: + if outcome_value >= anomaly_sensitivity: + confirmed_count = outcome_value + else: + anomaly_count = outcome_value + + # Map "down" to failures + else: + failure_count = outcome_value + + entry = AggregationEntry( + anomaly_count=anomaly_count, + confirmed_count=confirmed_count, + failure_count=failure_count, + ok_count=ok_count, + measurement_count=1.0, + measurement_start_day=d["measurement_start_day"], + outcome_label=outcome_label, + outcome_value=outcome_value, + domain=d.get("domain"), + probe_cc=d.get("probe_cc"), + probe_asn=d.get("probe_asn"), + ) + results.append(entry) return AggregationResponse( db_stats=DBStats( bytes=-1, diff --git a/oonipipeline/src/oonipipeline/api/routers/aggregate_observations.py b/oonipipeline/src/oonipipeline/api/routers/aggregate_observations.py index 55c6b8ad..76af6873 100644 --- a/oonipipeline/src/oonipipeline/api/routers/aggregate_observations.py +++ b/oonipipeline/src/oonipipeline/api/routers/aggregate_observations.py @@ -51,9 +51,9 @@ class AggregationResponse(BaseModel): @router.get( - "/observations-aggregate", response_model_exclude_none=True, tags=["aggregation"] + "/aggregation/observations", response_model_exclude_none=True, tags=["aggregation"] ) -async def get_obs_aggregation( +async def get_aggregation_observations( db: Annotated[ClickhouseClient, Depends(get_clickhouse_client)], group_by: Annotated[List[AggregationKeys], Query()] = [ "failure", @@ -168,10 +168,8 @@ async def get_obs_aggregation( {group_by_str} {order_by_str} """ - print(query) entries = [] for row in db.execute_iter(query, params_filter): d = dict(zip(column_keys, row)) - print(d) entries.append(AggregationEntry(**d)) return AggregationResponse(aggregation=entries) diff --git a/oonipipeline/src/oonipipeline/cli/commands.py b/oonipipeline/src/oonipipeline/cli/commands.py index 1e063762..1cc65635 100644 --- a/oonipipeline/src/oonipipeline/cli/commands.py +++ b/oonipipeline/src/oonipipeline/cli/commands.py @@ -209,8 +209,6 @@ async def main(): client=client, probe_cc=probe_cc, test_name=test_name, - clickhouse_url=config.clickhouse_url, - data_dir=config.data_dir, schedule_analysis=analysis, ) diff --git a/oonipipeline/src/oonipipeline/db/create_tables.py b/oonipipeline/src/oonipipeline/db/create_tables.py index ee8efb20..a8a6518d 100644 --- a/oonipipeline/src/oonipipeline/db/create_tables.py +++ b/oonipipeline/src/oonipipeline/db/create_tables.py @@ -17,11 +17,6 @@ import typing from oonidata.models.base import TableModelProtocol -from oonidata.models.experiment_result import ( - ExperimentResult, - MeasurementExperimentResult, -) -from oonidata.models.analysis import WebAnalysis from oonidata.models.observations import ( MeasurementMeta, ProbeMeta, @@ -171,13 +166,49 @@ def format_create_query( WebObservation, WebControlObservation, HTTPMiddleboxObservation, - WebAnalysis, - MeasurementExperimentResult, ] def make_create_queries(): - create_queries = [] + create_queries = [ + ( + """ + CREATE TABLE IF NOT EXISTS fingerprints_dns ( + `name` String, `scope` String, `other_names` String, `location_found` String, `pattern_type` String, + `pattern` String, `confidence_no_fp` String, `expected_countries` String, `source` String, `exp_url` String, `notes` String + ) ENGINE = URL('https://raw.githubusercontent.com/ooni/blocking-fingerprints/main/fingerprints_dns.csv', 'CSV') + 
""", + "fingerprints_dns", + ), + ( + """ + CREATE TABLE IF NOT EXISTS analysis_web_measurement + ( + `domain` String, + `input` String, + `test_name` String, + `probe_asn` UInt32, + `probe_as_org_name` String, + `probe_cc` String, + `resolver_asn` UInt32, `resolver_as_cc` String, `network_type` String, + `measurement_start_time` DateTime64(3, 'UTC'), + `measurement_uid` String, + `ooni_run_link_id` String, + `top_probe_analysis` Nullable(String), + `top_dns_failure` Nullable(String), + `top_tcp_failure` Nullable(String), `top_tls_failure` Nullable(String), + `dns_blocked_max` Float32, `dns_down_max` Float32, `dns_ok_max` Float32, + `tcp_blocked_max` Float32, `tcp_down_max` Float32, `tcp_ok_max` Float32, + `tls_blocked_max` Float32, `tls_down_max` Float32, `tls_ok_max` Float32 + ) + ENGINE = ReplacingMergeTree + PRIMARY KEY measurement_uid + ORDER BY (measurement_uid, measurement_start_time, probe_cc, probe_asn) + SETTINGS index_granularity = 8192 + """, + "analysis_web_measurement", + ), + ] for model in table_models: table_name = model.__table_name__ create_queries.append( diff --git a/oonipipeline/src/oonipipeline/temporal/activities/analysis.py b/oonipipeline/src/oonipipeline/temporal/activities/analysis.py index 77f85967..f9d038b0 100644 --- a/oonipipeline/src/oonipipeline/temporal/activities/analysis.py +++ b/oonipipeline/src/oonipipeline/temporal/activities/analysis.py @@ -1,213 +1,42 @@ -import dataclasses from dataclasses import dataclass -import pathlib -from datetime import datetime -from typing import Dict, List +from datetime import datetime, timedelta +from typing import List -from oonipipeline.temporal.common import TS_FORMAT -import opentelemetry.trace from temporalio import workflow, activity with workflow.unsafe.imports_passed_through(): import clickhouse_driver - import orjson - - from oonidata.models.analysis import WebAnalysis - from oonidata.models.experiment_result import MeasurementExperimentResult - - from ...analysis.control import BodyDB, WebGroundTruthDB - from ...analysis.datasources import iter_web_observations - from ...analysis.web_analysis import make_web_analysis - from ...analysis.website_experiment_results import make_website_experiment_results + from ...analysis.web_analysis import write_analysis_web_fuzzy_logic from ...db.connections import ClickhouseConnection - from ...fingerprintdb import FingerprintDB + from ...settings import config - from ..common import ( - get_prev_range, - maybe_delete_prev_range, - ) log = activity.logger -def make_cc_batches( - cnt_by_cc: Dict[str, int], - probe_cc: List[str], - parallelism: int, -) -> List[List[str]]: - """ - The goal of this function is to spread the load of each batch of - measurements by probe_cc. This allows us to parallelize analysis on a - per-country basis based on the number of measurements. - We assume that the measurements are uniformly distributed over the tested - interval and then break them up into a number of batches equivalent to the - parallelism count based on the number of measurements in each country. - - Here is a concrete example, suppose we have 3 countries IT, IR, US with 300, - 400, 1000 measurements respectively and a parallelism of 2, we will be - creating 2 batches where the first has in it IT, IR and the second has US. - """ - if len(probe_cc) > 0: - selected_ccs_with_cnt = set(probe_cc).intersection(set(cnt_by_cc.keys())) - if len(selected_ccs_with_cnt) == 0: - raise Exception( - f"No observations for {probe_cc} in the time range. 
Try adjusting the date range or choosing different countries" - ) - # We remove from the cnt_by_cc all the countries we are not interested in - cnt_by_cc = {k: cnt_by_cc[k] for k in selected_ccs_with_cnt} - - total_obs_cnt = sum(cnt_by_cc.values()) - - # We assume uniform distribution of observations per (country, day) - max_obs_per_batch = total_obs_cnt / parallelism - - # We break up the countries into batches where the count of observations in - # each batch is roughly equal. - # This is done so that we can spread the load based on the countries in - # addition to the time range. - cc_batches = [] - current_cc_batch_size = 0 - current_cc_batch = [] - cnt_by_cc_sorted = sorted(cnt_by_cc.items(), key=lambda x: x[0]) - while cnt_by_cc_sorted: - while current_cc_batch_size <= max_obs_per_batch: - try: - cc, cnt = cnt_by_cc_sorted.pop() - except IndexError: - break - current_cc_batch.append(cc) - current_cc_batch_size += cnt - cc_batches.append(current_cc_batch) - current_cc_batch = [] - current_cc_batch_size = 0 - if len(current_cc_batch) > 0: - cc_batches.append(current_cc_batch) - return cc_batches - - @dataclass class MakeAnalysisParams: probe_cc: List[str] test_name: List[str] - clickhouse: str - data_dir: str - fast_fail: bool day: str @activity.defn -def make_analysis_in_a_day(params: MakeAnalysisParams) -> dict: - data_dir = pathlib.Path(params.data_dir) - clickhouse = params.clickhouse - day = datetime.strptime(params.day, "%Y-%m-%d").date() +def make_analysis_in_a_day(params: MakeAnalysisParams): + day = datetime.strptime(params.day, "%Y-%m-%d") + start_time = day + end_time = day + timedelta(days=1) + probe_cc = params.probe_cc test_name = params.test_name - - tracer = opentelemetry.trace.get_tracer(__name__) - - fingerprintdb = FingerprintDB(datadir=data_dir, download=False) - body_db = BodyDB(db=ClickhouseConnection(clickhouse)) - db_writer = ClickhouseConnection(clickhouse) - db_lookup = ClickhouseConnection(clickhouse) - - # This makes sure that the buffer tables are being flushed so that the - # following queries are accurate - db_writer.execute(f"OPTIMIZE TABLE {WebAnalysis.__table_name__} FINAL") - db_writer.execute( - f"OPTIMIZE TABLE {MeasurementExperimentResult.__table_name__} FINAL" + db = ClickhouseConnection(config.clickhouse_url) + + write_analysis_web_fuzzy_logic( + db=db, + start_time=start_time, + end_time=end_time, + probe_cc=probe_cc, + test_name=test_name, ) - prev_range_list = [ - get_prev_range( - db=db_lookup, - table_name=WebAnalysis.__table_name__, - timestamp=datetime.combine(day, datetime.min.time()).strftime(TS_FORMAT), - test_name=[], - probe_cc=probe_cc, - timestamp_column="measurement_start_time", - ), - get_prev_range( - db=db_lookup, - table_name=MeasurementExperimentResult.__table_name__, - timestamp=datetime.combine(day, datetime.min.time()).strftime(TS_FORMAT), - test_name=[], - probe_cc=probe_cc, - timestamp_column="timeofday", - probe_cc_column="location_network_cc", - ), - ] - - log.info(f"loading ground truth DB for {day}") - with tracer.start_span("MakeObservations:load_ground_truths") as span: - ground_truth_db_path = ( - data_dir / "ground_truths" / f"web-{day.strftime('%Y-%m-%d')}.sqlite3" - ) - web_ground_truth_db = WebGroundTruthDB() - web_ground_truth_db.build_from_existing(str(ground_truth_db_path.absolute())) - log.info(f"loaded ground truth DB for {day}") - span.add_event(f"loaded ground truth DB for {day}") - span.set_attribute("day", day.strftime("%Y-%m-%d")) - span.set_attribute("ground_truth_row_count", 
web_ground_truth_db.count_rows()) - - failures = 0 - no_exp_results = 0 - observation_count = 0 - with tracer.start_span("MakeObservations:iter_web_observations") as span: - for web_obs in iter_web_observations( - db_lookup, - measurement_day=day, - probe_cc=probe_cc, - test_name="web_connectivity", - ): - try: - relevant_gts = web_ground_truth_db.lookup_by_web_obs(web_obs=web_obs) - except: - log.error( - f"failed to lookup relevant_gts for {web_obs[0].measurement_meta.measurement_uid}", - exc_info=True, - ) - failures += 1 - continue - - try: - website_analysis = list( - make_web_analysis( - web_observations=web_obs, - body_db=body_db, - web_ground_truths=relevant_gts, - fingerprintdb=fingerprintdb, - ) - ) - if len(website_analysis) == 0: - log.info(f"no website analysis for {probe_cc}, {test_name}") - no_exp_results += 1 - continue - - observation_count += 1 - - db_writer.write_table_model_rows(website_analysis) - db_writer.write_table_model_rows( - make_website_experiment_results(website_analysis) - ) - - except: - web_obs_ids = ",".join( - map(lambda wo: wo.measurement_meta.measurement_uid, web_obs) - ) - log.error( - f"failed to generate analysis for {web_obs_ids}", exc_info=True - ) - failures += 1 - - span.set_attribute("total_failure_count", failures) - span.set_attribute("total_observation_count", observation_count) - span.set_attribute("no_experiment_results_count", no_exp_results) - span.set_attribute("day", day.strftime("%Y-%m-%d")) - span.set_attribute("probe_cc", probe_cc) - - for prev_range in prev_range_list: - maybe_delete_prev_range(db=db_lookup, prev_range=prev_range) - db_writer.close() - - return {"count": observation_count} diff --git a/oonipipeline/src/oonipipeline/temporal/activities/common.py b/oonipipeline/src/oonipipeline/temporal/activities/common.py index c79df876..c14dea8c 100644 --- a/oonipipeline/src/oonipipeline/temporal/activities/common.py +++ b/oonipipeline/src/oonipipeline/temporal/activities/common.py @@ -10,7 +10,6 @@ from oonipipeline.db.create_tables import make_create_queries from oonipipeline.netinfo import NetinfoDB -from oonipipeline.temporal.common import wait_for_mutations from temporalio import activity DATETIME_UTC_FORMAT = "%Y-%m-%dT%H:%M%SZ" @@ -79,32 +78,3 @@ def update_assets( log.info( f"skipping updating netinfodb because {last_updated_delta} < {refresh_hours}h" ) - - -@dataclass -class ObsCountParams: - clickhouse_url: str - # TODO(art): we should also be using test_name here - # test_name: List[str] - start_day: str - end_day: str - table_name: str = "obs_web" - - -@activity.defn -def get_obs_count_by_cc( - params: ObsCountParams, -) -> Dict[str, int]: - with ClickhouseConnection(params.clickhouse_url) as db: - q = f""" - SELECT - probe_cc, COUNT() - FROM {params.table_name} - WHERE measurement_start_time > %(start_day)s AND measurement_start_time < %(end_day)s - GROUP BY probe_cc - """ - cc_list: List[Tuple[str, int]] = db.execute( - q, {"start_day": params.start_day, "end_day": params.end_day} - ) # type: ignore - assert isinstance(cc_list, list) - return dict(cc_list) diff --git a/oonipipeline/src/oonipipeline/temporal/activities/ground_truths.py b/oonipipeline/src/oonipipeline/temporal/activities/ground_truths.py deleted file mode 100644 index 739b0573..00000000 --- a/oonipipeline/src/oonipipeline/temporal/activities/ground_truths.py +++ /dev/null @@ -1,58 +0,0 @@ -from dataclasses import dataclass -import pathlib -import logging - -from datetime import datetime - -from temporalio import workflow, activity - -with 
workflow.unsafe.imports_passed_through(): - import clickhouse_driver - - from oonidata.datautils import PerfTimer - from ...analysis.control import WebGroundTruthDB, iter_web_ground_truths - from ...netinfo import NetinfoDB - from ...db.connections import ( - ClickhouseConnection, - ) - -log = activity.logger - - -@dataclass -class MakeGroundTruthsParams: - clickhouse: str - data_dir: str - day: str - force_rebuild: bool = False - - -def get_ground_truth_db_path(data_dir: str, day: str): - ground_truth_dir = pathlib.Path(data_dir) / "ground_truths" - ground_truth_dir.mkdir(exist_ok=True) - return ground_truth_dir / f"web-{day}.sqlite3" - - -@activity.defn -def make_ground_truths_in_day(params: MakeGroundTruthsParams): - clickhouse = params.clickhouse - - db = ClickhouseConnection(clickhouse) - netinfodb = NetinfoDB(datadir=pathlib.Path(params.data_dir), download=False) - - dst_path = get_ground_truth_db_path(data_dir=params.data_dir, day=params.day) - - if dst_path.exists() and params.force_rebuild: - dst_path.unlink() - elif dst_path.exists(): - return - - t = PerfTimer() - day = datetime.strptime(params.day, "%Y-%m-%d").date() - log.info(f"building ground truth DB for {day}") - web_ground_truth_db = WebGroundTruthDB(connect_str=str(dst_path.absolute())) - web_ground_truth_db.build_from_rows( - rows=iter_web_ground_truths(db=db, measurement_day=day, netinfodb=netinfodb) - ) - web_ground_truth_db.close() - log.info(f"built ground truth DB {day} in {t.pretty}") diff --git a/oonipipeline/src/oonipipeline/temporal/activities/observations.py b/oonipipeline/src/oonipipeline/temporal/activities/observations.py index df957738..2f62a5c5 100644 --- a/oonipipeline/src/oonipipeline/temporal/activities/observations.py +++ b/oonipipeline/src/oonipipeline/temporal/activities/observations.py @@ -13,11 +13,6 @@ from oonidata.models.nettests import SupportedDataformats from oonipipeline.db.connections import ClickhouseConnection from oonipipeline.netinfo import NetinfoDB -from oonipipeline.temporal.common import ( - PrevRange, - get_prev_range, - maybe_delete_prev_range, -) from oonipipeline.temporal.activities.common import process_pool_executor, update_assets from oonipipeline.settings import config from opentelemetry import trace @@ -59,6 +54,21 @@ class MakeObservationsFileEntryBatch: fast_fail: bool +def write_observations_to_db( + db: ClickhouseConnection, + netinfodb: NetinfoDB, + bucket_date: str, + msmt: SupportedDataformats, +): + obs_tuple = measurement_to_observations( + msmt=msmt, + netinfodb=netinfodb, + bucket_date=bucket_date, + ) + for obs_list in obs_tuple: + db.write_table_model_rows(obs_list) + + def make_observations_for_file_entry( db: ClickhouseConnection, netinfodb: NetinfoDB, @@ -91,13 +101,9 @@ def make_observations_for_file_entry( continue try: msmt = load_measurement(msmt_dict) - obs_tuple = measurement_to_observations( - msmt=msmt, - netinfodb=netinfodb, - bucket_date=bucket_date, + write_observations_to_db( + db=db, netinfodb=netinfodb, bucket_date=bucket_date, msmt=msmt ) - for obs_list in obs_tuple: - db.write_table_model_rows(obs_list) measurement_count += 1 except Exception as exc: log.error(f"failed at idx: {measurement_count} ({msmt_str})", exc_info=True) @@ -244,45 +250,3 @@ async def make_observations(params: MakeObservationsParams) -> MakeObservationsR "measurement_per_sec": measurement_count / tbatch.s, "total_size": batches["total_size"], } - - -@dataclass -class GetPreviousRangeParams: - clickhouse: str - bucket_date: str - test_name: List[str] - probe_cc: 
List[str] - tables: List[str] - - -@activity.defn -def get_previous_range(params: GetPreviousRangeParams) -> List[PrevRange]: - with ClickhouseConnection(params.clickhouse) as db: - prev_ranges = [] - for table_name in params.tables: - prev_ranges.append( - get_prev_range( - db=db, - table_name=table_name, - bucket_date=params.bucket_date, - test_name=params.test_name, - probe_cc=params.probe_cc, - ), - ) - return prev_ranges - - -@dataclass -class DeletePreviousRangeParams: - clickhouse: str - previous_ranges: List[PrevRange] - - -@activity.defn -def delete_previous_range(params: DeletePreviousRangeParams) -> List[str]: - delete_queries = [] - with ClickhouseConnection(params.clickhouse) as db: - for pr in params.previous_ranges: - log.info("deleting previous range of {pr}") - delete_queries.append(maybe_delete_prev_range(db=db, prev_range=pr)) - return delete_queries diff --git a/oonipipeline/src/oonipipeline/temporal/common.py b/oonipipeline/src/oonipipeline/temporal/common.py index 4ece122b..e130ef33 100644 --- a/oonipipeline/src/oonipipeline/temporal/common.py +++ b/oonipipeline/src/oonipipeline/temporal/common.py @@ -1,11 +1,6 @@ -from dataclasses import dataclass import logging -from datetime import datetime, timedelta - -import time from typing import ( - Any, Callable, Dict, List, @@ -13,166 +8,10 @@ Tuple, ) -from ..db.connections import ClickhouseConnection - log = logging.getLogger("oonidata.processing") TS_FORMAT = "%Y-%m-%d %H:%M:%S" -@dataclass -class BatchParameters: - test_name: List[str] - probe_cc: List[str] - bucket_date: Optional[str] - timestamp: Optional[str] - - -@dataclass -class PrevRange: - table_name: str - batch_parameters: BatchParameters - timestamp_column: Optional[str] - probe_cc_column: Optional[str] - max_created_at: Optional[str] = None - min_created_at: Optional[str] = None - - def format_query(self): - start_timestamp = None - end_timestamp = None - where = None - where = "WHERE " - q_args: Dict[str, Any] = {} - - if self.batch_parameters.bucket_date: - where = "WHERE bucket_date = %(bucket_date)s" - q_args["bucket_date"] = self.batch_parameters.bucket_date - - elif self.batch_parameters.timestamp: - start_timestamp = datetime.strptime( - self.batch_parameters.timestamp, TS_FORMAT - ) - end_timestamp = start_timestamp + timedelta(days=1) - q_args["start_timestamp"] = start_timestamp - q_args["end_timestamp"] = end_timestamp - where += f"{self.timestamp_column} >= %(start_timestamp)s AND {self.timestamp_column} < %(end_timestamp)s" - else: - raise Exception("Must specify either bucket_date or timestamp") - - if len(self.batch_parameters.test_name) > 0: - where += " AND test_name IN %(test_names)s" - q_args["test_names"] = self.batch_parameters.test_name - if len(self.batch_parameters.probe_cc) > 0: - where += f" AND {self.probe_cc_column} IN %(probe_ccs)s" - q_args["probe_ccs"] = self.batch_parameters.probe_cc - - return where, q_args - - -def wait_for_mutations(db, table_name): - while True: - res = db.execute( - f"SELECT * FROM system.mutations WHERE is_done=0 AND table='{table_name}';" - ) - if len(res) == 0: # type: ignore - break - time.sleep(1) - - -def maybe_delete_prev_range(db: ClickhouseConnection, prev_range: PrevRange) -> str: - """ - We perform a lightweight delete of all the rows which have been - regenerated, so we don't have any duplicates in the table - """ - if not prev_range.max_created_at or not prev_range.min_created_at: - return "" - - # Before deleting, we need to wait for all the mutations to be done - wait_for_mutations(db, 
prev_range.table_name) - where, q_args = prev_range.format_query() - - q_args["max_created_at"] = prev_range.max_created_at - q_args["min_created_at"] = prev_range.min_created_at - where = f"{where} AND created_at <= %(max_created_at)s AND created_at >= %(min_created_at)s" - log.debug(f"runing {where} with {q_args}") - - final_query = f"ALTER TABLE {prev_range.table_name} DELETE {where}" - db.execute(final_query, q_args) - return final_query - - -def get_prev_range( - db: ClickhouseConnection, - table_name: str, - test_name: List[str], - probe_cc: List[str], - bucket_date: Optional[str] = None, - timestamp: Optional[str] = None, - timestamp_column: str = "timestamp", - probe_cc_column: str = "probe_cc", -) -> PrevRange: - """ - We lookup the range of previously generated rows so we can drop - them from the database once we have finished processing. - - We can't rely just on deduplication happening at the clickhouse level, - because we might in the future add or remove certain rows, so it's - more robust to just drop them once we are done reprocessing. - - Moreover, you don't have any guarantee on when the deduplication is - happening, which means that if you run queries while the reprocessing is - happening you don't know when exactly it's going to be safe to run - deduplcated queries on the DB. - - For observation tables we use the bucket_date field. For experiment results - we use a range of timestamp in a day. - In both cases we delimit the range via the created_at column and any - additional filters that may have been applied to the reprocessing process. - - TODO: while the reprocessing is running we should probably flag this - bucket as reprocessing in progress and guard against running queries for - it. - """ - # A batch specified by test_name, probe_cc and one of either bucket_date or - # timestamp depending on it being observations or experiment results. 
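# Typical usage of this helper, as sketched from the docstring above (argument
# values are illustrative; the helper is removed by this diff): a reprocessing run
# first records the created_at range of the rows it is about to regenerate,
#   prev = get_prev_range(db, "obs_web", test_name=[], probe_cc=["IT"],
#                         bucket_date="2024-01-01")
# then rewrites the bucket, and finally calls
#   maybe_delete_prev_range(db=db, prev_range=prev)
# so that only the rows written by the earlier run are dropped.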
- assert ( - timestamp or bucket_date - ), "either timestamp or bucket_date should be provided" - prev_range = PrevRange( - table_name=table_name, - batch_parameters=BatchParameters( - test_name=test_name, - probe_cc=probe_cc, - timestamp=timestamp, - bucket_date=bucket_date, - ), - timestamp_column=timestamp_column, - probe_cc_column=probe_cc_column, - ) - - q = f"SELECT MAX(created_at), MIN(created_at) FROM {prev_range.table_name} " - where, q_args = prev_range.format_query() - final_query = q + where - prev_obs_range = db.execute(final_query, q_args) - assert isinstance(prev_obs_range, list) and len(prev_obs_range) == 1 - max_created_at, min_created_at = prev_obs_range[0] - - # We pad it by 1 second to take into account the time resolution downgrade - # happening when going from clickhouse to python data types - if max_created_at and min_created_at: - prev_range.max_created_at = ( - (max_created_at + timedelta(seconds=1)) - .replace(tzinfo=None) - .strftime(TS_FORMAT) - ) - prev_range.min_created_at = ( - (min_created_at - timedelta(seconds=1)) - .replace(tzinfo=None) - .strftime(TS_FORMAT) - ) - - return prev_range - - def make_db_rows( dc_list: List, column_names: List[str], diff --git a/oonipipeline/src/oonipipeline/temporal/schedules.py b/oonipipeline/src/oonipipeline/temporal/schedules.py index f1f8db7b..602f095a 100644 --- a/oonipipeline/src/oonipipeline/temporal/schedules.py +++ b/oonipipeline/src/oonipipeline/temporal/schedules.py @@ -83,8 +83,6 @@ async def schedule_all( client: TemporalClient, probe_cc: List[str], test_name: List[str], - clickhouse_url: str, - data_dir: str, schedule_analysis: bool = True, ) -> ScheduleIdMap: schedule_id_map = ScheduleIdMap() @@ -106,8 +104,6 @@ async def schedule_all( obs_params = ObservationsWorkflowParams( probe_cc=probe_cc, test_name=test_name, - clickhouse=clickhouse_url, - data_dir=data_dir, fast_fail=False, ) sched_handle = await client.create_schedule( @@ -141,9 +137,6 @@ async def schedule_all( analysis_params = AnalysisWorkflowParams( probe_cc=probe_cc, test_name=test_name, - clickhouse=clickhouse_url, - data_dir=data_dir, - fast_fail=False, ) sched_handle = await client.create_schedule( id=f"{ANALYSIS_SCHED_PREFIX}-{filter_id}-{ts}", diff --git a/oonipipeline/src/oonipipeline/temporal/workers.py b/oonipipeline/src/oonipipeline/temporal/workers.py index 2622c96b..3055dbab 100644 --- a/oonipipeline/src/oonipipeline/temporal/workers.py +++ b/oonipipeline/src/oonipipeline/temporal/workers.py @@ -6,14 +6,10 @@ from oonipipeline.temporal.activities.analysis import make_analysis_in_a_day from oonipipeline.temporal.activities.common import ( - get_obs_count_by_cc, optimize_all_tables, optimize_tables, ) -from oonipipeline.temporal.activities.ground_truths import make_ground_truths_in_day from oonipipeline.temporal.activities.observations import ( - delete_previous_range, - get_previous_range, make_observations, ) from oonipipeline.temporal.client_operations import ( @@ -23,29 +19,23 @@ ) from oonipipeline.temporal.workflows.common import TASK_QUEUE_NAME from oonipipeline.temporal.workflows.analysis import AnalysisWorkflow -from oonipipeline.temporal.workflows.ctrl import GroundTruthsWorkflow from oonipipeline.temporal.workflows.observations import ObservationsWorkflow log = logging.getLogger("oonipipeline.workers") -from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, Executor +from concurrent.futures import ThreadPoolExecutor, Executor interrupt_event = asyncio.Event() WORKFLOWS = [ ObservationsWorkflow, - 
GroundTruthsWorkflow, AnalysisWorkflow, ] ACTIVTIES = [ - delete_previous_range, - get_previous_range, make_observations, - make_ground_truths_in_day, make_analysis_in_a_day, optimize_all_tables, - get_obs_count_by_cc, optimize_tables, ] diff --git a/oonipipeline/src/oonipipeline/temporal/workflows/analysis.py b/oonipipeline/src/oonipipeline/temporal/workflows/analysis.py index 14715dab..be788951 100644 --- a/oonipipeline/src/oonipipeline/temporal/workflows/analysis.py +++ b/oonipipeline/src/oonipipeline/temporal/workflows/analysis.py @@ -6,24 +6,13 @@ from temporalio import workflow +from temporalio.common import RetryPolicy with workflow.unsafe.imports_passed_through(): from oonidata.datautils import PerfTimer from oonipipeline.temporal.activities.analysis import ( MakeAnalysisParams, make_analysis_in_a_day, - make_cc_batches, - ) - from oonipipeline.temporal.activities.common import ( - ClickhouseParams, - OptimizeTablesParams, - ObsCountParams, - get_obs_count_by_cc, - optimize_all_tables, - ) - from oonipipeline.temporal.activities.ground_truths import ( - MakeGroundTruthsParams, - make_ground_truths_in_day, ) from oonipipeline.temporal.workflows.common import ( MAKE_ANALYSIS_START_TO_CLOSE_TIMEOUT, @@ -35,13 +24,7 @@ class AnalysisWorkflowParams: probe_cc: List[str] test_name: List[str] - clickhouse: str - data_dir: str - parallelism: int = 10 - fast_fail: bool = False day: Optional[str] = None - force_rebuild_ground_truths: bool = False - log_level: int = logging.INFO @workflow.defn @@ -52,68 +35,15 @@ async def run(self, params: AnalysisWorkflowParams) -> dict: params.day = (get_workflow_start_time() - timedelta(days=1)).strftime( "%Y-%m-%d" ) - - await workflow.execute_activity( - optimize_all_tables, - ClickhouseParams(clickhouse_url=params.clickhouse), - start_to_close_timeout=timedelta(minutes=5), - ) - - workflow.logger.info("building ground truth databases") - t = PerfTimer() - await workflow.execute_activity( - make_ground_truths_in_day, - MakeGroundTruthsParams( - clickhouse=params.clickhouse, - data_dir=params.data_dir, + make_analysis_in_a_day, + MakeAnalysisParams( + probe_cc=params.probe_cc, + test_name=params.test_name, day=params.day, - force_rebuild=params.force_rebuild_ground_truths, ), - start_to_close_timeout=timedelta(minutes=30), + start_to_close_timeout=MAKE_ANALYSIS_START_TO_CLOSE_TIMEOUT, + retry_policy=RetryPolicy(maximum_attempts=3), ) - workflow.logger.info(f"built ground truth db in {t.pretty}") - - start_day = datetime.strptime(params.day, "%Y-%m-%d").date() - cnt_by_cc = await workflow.execute_activity( - get_obs_count_by_cc, - ObsCountParams( - clickhouse_url=params.clickhouse, - start_day=start_day.strftime("%Y-%m-%d"), - end_day=(start_day + timedelta(days=1)).strftime("%Y-%m-%d"), - ), - start_to_close_timeout=timedelta(minutes=30), - ) - - cc_batches = make_cc_batches( - cnt_by_cc=cnt_by_cc, - probe_cc=params.probe_cc, - parallelism=params.parallelism, - ) - - workflow.logger.info( - f"starting processing of {len(cc_batches)} batches for {params.day} days (parallelism = {params.parallelism})" - ) - workflow.logger.info(f"({cc_batches})") - - task_list = [] - async with asyncio.TaskGroup() as tg: - for probe_cc in cc_batches: - task = tg.create_task( - workflow.execute_activity( - make_analysis_in_a_day, - MakeAnalysisParams( - probe_cc=probe_cc, - test_name=params.test_name, - clickhouse=params.clickhouse, - data_dir=params.data_dir, - fast_fail=params.fast_fail, - day=params.day, - ), - start_to_close_timeout=MAKE_ANALYSIS_START_TO_CLOSE_TIMEOUT, 
- ) - ) - task_list.append(task) - total_obs_count = sum(map(lambda x: x.result()["count"], task_list)) - return {"obs_count": total_obs_count, "day": params.day} + return {"day": params.day} diff --git a/oonipipeline/src/oonipipeline/temporal/workflows/ctrl.py b/oonipipeline/src/oonipipeline/temporal/workflows/ctrl.py deleted file mode 100644 index 1a8dbcaf..00000000 --- a/oonipipeline/src/oonipipeline/temporal/workflows/ctrl.py +++ /dev/null @@ -1,49 +0,0 @@ -import asyncio -from dataclasses import dataclass - -from datetime import datetime, timedelta - -from temporalio import workflow - -with workflow.unsafe.imports_passed_through(): - from oonidata.dataclient import date_interval - from oonipipeline.temporal.activities.ground_truths import ( - MakeGroundTruthsParams, - make_ground_truths_in_day, - ) - from oonipipeline.temporal.workflows.common import ( - MAKE_GROUND_TRUTHS_START_TO_CLOSE_TIMEOUT, - ) - - -@dataclass -class GroundTruthsWorkflowParams: - start_day: str - end_day: str - clickhouse: str - data_dir: str - - -@workflow.defn -class GroundTruthsWorkflow: - @workflow.run - async def run( - self, - params: GroundTruthsWorkflowParams, - ): - start_day = datetime.strptime(params.start_day, "%Y-%m-%d").date() - end_day = datetime.strptime(params.end_day, "%Y-%m-%d").date() - - async with asyncio.TaskGroup() as tg: - for day in date_interval(start_day, end_day): - tg.create_task( - workflow.execute_activity( - make_ground_truths_in_day, - MakeGroundTruthsParams( - clickhouse=params.clickhouse, - data_dir=params.data_dir, - day=day.strftime("%Y-%m-%d"), - ), - start_to_close_timeout=MAKE_GROUND_TRUTHS_START_TO_CLOSE_TIMEOUT, - ) - ) diff --git a/oonipipeline/src/oonipipeline/temporal/workflows/observations.py b/oonipipeline/src/oonipipeline/temporal/workflows/observations.py index 9482f826..8394e18f 100644 --- a/oonipipeline/src/oonipipeline/temporal/workflows/observations.py +++ b/oonipipeline/src/oonipipeline/temporal/workflows/observations.py @@ -20,16 +20,14 @@ from oonipipeline.temporal.workflows.common import ( get_workflow_start_time, ) - + from oonipipeline.settings import config @dataclass class ObservationsWorkflowParams: probe_cc: List[str] test_name: List[str] - clickhouse: str - data_dir: str fast_fail: bool - is_reprocessing: bool = True + is_reprocessing: bool = False bucket_date: Optional[str] = None @@ -46,8 +44,8 @@ async def run(self, params: ObservationsWorkflowParams) -> dict: params_make_observations = MakeObservationsParams( probe_cc=params.probe_cc, test_name=params.test_name, - clickhouse=params.clickhouse, - data_dir=params.data_dir, + clickhouse=config.clickhouse_url, + data_dir=config.data_dir, fast_fail=params.fast_fail, bucket_date=params.bucket_date, ) @@ -70,7 +68,7 @@ async def run(self, params: ObservationsWorkflowParams) -> dict: await workflow.execute_activity( optimize_tables, OptimizeTablesParams( - clickhouse=params.clickhouse, + clickhouse=config.clickhouse_url, table_names=["obs_web", "obs_web_ctrl", "obs_http_middlebox"], partition_str=partition_str, ), diff --git a/oonipipeline/src/oonipipeline/transforms/measurement_transformer.py b/oonipipeline/src/oonipipeline/transforms/measurement_transformer.py index d23b0ad6..11243f85 100644 --- a/oonipipeline/src/oonipipeline/transforms/measurement_transformer.py +++ b/oonipipeline/src/oonipipeline/transforms/measurement_transformer.py @@ -427,7 +427,7 @@ def measurement_to_tls_observation( try: tlso.certificate_chain_fingerprints = list( map(lambda d: hashlib.sha256(d).hexdigest(), 
tlso.peer_certificates) - ) + )[1:] except Exception: log.warning("failed to decode peer_certificates") diff --git a/oonipipeline/src/oonipipeline/transforms/observations.py b/oonipipeline/src/oonipipeline/transforms/observations.py index 5002c830..5cd728e7 100644 --- a/oonipipeline/src/oonipipeline/transforms/observations.py +++ b/oonipipeline/src/oonipipeline/transforms/observations.py @@ -66,7 +66,7 @@ def measurement_to_observations( msmt: Union[HTTPHeaderFieldManipulation, HTTPInvalidRequestLine], netinfodb: NetinfoDB, - bucket_date: str, + bucket_date: str = "1984-01-01", ) -> TypeHTTPMiddleboxObservations: ... @@ -74,7 +74,7 @@ def measurement_to_observations( def measurement_to_observations( msmt: WebConnectivity, netinfodb: NetinfoDB, - bucket_date: str, + bucket_date: str = "1984-01-01", ) -> TypeWebConnectivityObservations: ... @@ -84,7 +84,7 @@ def measurement_to_observations( Signal, Whatsapp, Telegram, StunReachability, Tor, FacebookMessenger, UrlGetter ], netinfodb: NetinfoDB, - bucket_date: str, + bucket_date: str = "1984-01-01", ) -> TypeWebObservations: ... @@ -92,7 +92,7 @@ def measurement_to_observations( def measurement_to_observations( msmt: SupportedDataformats, netinfodb: NetinfoDB, - bucket_date: str, + bucket_date: str = "1984-01-01", ) -> TypeWebObservations: ... diff --git a/oonipipeline/tests/conftest.py b/oonipipeline/tests/conftest.py index 0c107935..53b54091 100644 --- a/oonipipeline/tests/conftest.py +++ b/oonipipeline/tests/conftest.py @@ -23,6 +23,7 @@ from oonipipeline.db.create_tables import make_create_queries from oonipipeline.fingerprintdb import FingerprintDB from oonipipeline.netinfo import NetinfoDB +from oonipipeline.settings import config from ._fixtures import SAMPLE_MEASUREMENTS @@ -77,6 +78,7 @@ def temporal_dev_server(request): @pytest.fixture def datadir(): + config.data_dir = str(DATA_DIR) return DATA_DIR @@ -157,5 +159,10 @@ def db_notruncate(clickhouse_server): def db(clickhouse_server): db = create_db_for_fixture(clickhouse_server) for _, table_name in make_create_queries(): + # Ignore the fingerprints_dns table, since it's a remote table + if table_name == "fingerprints_dns": + continue db.execute(f"TRUNCATE TABLE {table_name};") + + config.clickhouse_url = db.clickhouse_url yield db diff --git a/oonipipeline/tests/test_analysis.py b/oonipipeline/tests/test_analysis.py index 5535be4e..52cc7099 100644 --- a/oonipipeline/tests/test_analysis.py +++ b/oonipipeline/tests/test_analysis.py @@ -1,15 +1,17 @@ from base64 import b64decode -from datetime import datetime +from datetime import datetime, timedelta from pprint import pprint import random from typing import List, Tuple from unittest.mock import MagicMock +from oonipipeline.analysis.web_analysis import ( + get_analysis_web_fuzzy_logic, +) +from oonipipeline.temporal.activities.observations import write_observations_to_db import pytest from oonidata.dataclient import load_measurement -from oonidata.models.analysis import WebAnalysis -from oonidata.models.experiment_result import MeasurementExperimentResult from oonidata.models.nettests.signal import Signal from oonidata.models.nettests.web_connectivity import WebConnectivity from oonidata.models.observations import ( @@ -18,443 +20,287 @@ ) from oonidata.datautils import validate_cert_chain -from oonipipeline.analysis.web_analysis import make_web_analysis -from oonipipeline.analysis.control import ( - BodyDB, - WebGroundTruth, - iter_ground_truths_from_web_control, - WebGroundTruthDB, -) from oonipipeline.transforms.nettests.signal 
import SIGNAL_PEM_STORE from oonipipeline.transforms.observations import measurement_to_observations -from oonipipeline.analysis.signal import make_signal_experiment_result -from oonipipeline.analysis.website_experiment_results import ( - make_website_experiment_results, -) - - -def test_signal(fingerprintdb, netinfodb, measurements): - signal_old_ca = load_measurement( - msmt_path=measurements["20221016235944.266268_GB_signal_1265ff650ee17b44"] - ) - assert isinstance(signal_old_ca, Signal) - assert signal_old_ca.test_keys.tls_handshakes - - for tls_handshake in signal_old_ca.test_keys.tls_handshakes: - assert tls_handshake.peer_certificates - assert tls_handshake.server_name - certificate_chain = list( - map(lambda c: b64decode(c.data), tls_handshake.peer_certificates) - ) - validate_cert_chain( - datetime(2021, 10, 16), - certificate_chain=certificate_chain, - pem_cert_store=SIGNAL_PEM_STORE, - ) - - signal_new_ca = load_measurement( - msmt_path=measurements["20221020235950.432819_NL_signal_27b05458f186a906"] - ) - assert isinstance(signal_new_ca, Signal) - assert signal_new_ca.test_keys.tls_handshakes - - for tls_handshake in signal_new_ca.test_keys.tls_handshakes: - assert tls_handshake.peer_certificates - assert tls_handshake.server_name - certificate_chain = list( - map(lambda c: b64decode(c.data), tls_handshake.peer_certificates) - ) - validate_cert_chain( - datetime(2022, 10, 20), - certificate_chain=certificate_chain, - pem_cert_store=SIGNAL_PEM_STORE, - ) - - web_observations = measurement_to_observations(signal_new_ca, netinfodb=netinfodb)[ - 0 - ] - er = list( - make_signal_experiment_result( - web_observations=web_observations, - fingerprintdb=fingerprintdb, - ) - ) - assert er[0].anomaly == False - assert er[0].confirmed == False - - signal_blocked_uz = load_measurement( - msmt_path=measurements["20210926222047.205897_UZ_signal_95fab4a2e669573f"] - ) - assert isinstance(signal_blocked_uz, Signal) - web_observations = measurement_to_observations( - signal_blocked_uz, netinfodb=netinfodb - )[0] - blocking_event = list( - make_signal_experiment_result( - web_observations=web_observations, - fingerprintdb=fingerprintdb, - ) - ) - assert blocking_event[0].anomaly == True - assert blocking_event[0].confirmed == False - tls_be = list( - filter( - lambda be: be.outcome_category == "tls", - blocking_event, - ) - ) - assert len(tls_be) > 0 - - signal_blocked_ir = load_measurement( - msmt_path=measurements["20221018174612.488229_IR_signal_f8640b28061bec06"] - ) - assert isinstance(signal_blocked_ir, Signal) - web_observations = measurement_to_observations( - signal_blocked_ir, netinfodb=netinfodb - )[0] - blocking_event = list( - make_signal_experiment_result( - web_observations=web_observations, - fingerprintdb=fingerprintdb, - ) - ) - assert blocking_event[0].anomaly == True - dns_outcomes = list( - filter( - lambda be: be.outcome_category == "dns", - blocking_event, - ) - ) - assert len(dns_outcomes) > 0 - assert blocking_event[0].confirmed == True - - -def make_experiment_result_from_wc_ctrl(msmt_path, fingerprintdb, netinfodb): - msmt = load_measurement(msmt_path=msmt_path) - assert isinstance(msmt, WebConnectivity) - _, web_control_observations = measurement_to_observations(msmt, netinfodb=netinfodb) - - assert msmt.test_keys.control - assert isinstance(msmt.input, str) - web_ground_truth_db = WebGroundTruthDB() - web_ground_truth_db.build_from_rows( - rows=iter_ground_truths_from_web_control( - web_control_observations=web_control_observations, - netinfodb=netinfodb, - ), - ) - 
- body_db = MagicMock() - body_db.lookup = MagicMock() - body_db.lookup.return_value = [] - - return [] - - -def make_web_er_from_msmt(msmt, fingerprintdb, netinfodb) -> Tuple[ - List[MeasurementExperimentResult], - List[WebAnalysis], - List[WebObservation], - List[WebControlObservation], -]: - assert isinstance(msmt, WebConnectivity) - web_observations, web_control_observations = measurement_to_observations( - msmt, netinfodb=netinfodb - ) - assert isinstance(msmt.input, str) - web_ground_truth_db = WebGroundTruthDB() - web_ground_truth_db.build_from_rows( - rows=iter_ground_truths_from_web_control( - web_control_observations=web_control_observations, - netinfodb=netinfodb, - ), - ) - - web_ground_truths = web_ground_truth_db.lookup_by_web_obs(web_obs=web_observations) - web_analysis = list( - make_web_analysis( - web_observations=web_observations, - web_ground_truths=web_ground_truths, - body_db=BodyDB(db=None), # type: ignore - fingerprintdb=fingerprintdb, - ) - ) - - return ( - list(make_website_experiment_results(web_analysis)), - web_analysis, - web_observations, - web_control_observations, - ) - - -def test_website_web_analysis_blocked(fingerprintdb, netinfodb, measurements, datadir): - msmt = load_measurement( - msmt_path=measurements[ - "20221110235922.335062_IR_webconnectivity_e4114ee32b8dbf74" - ], - ) - er, web_analysis, web_obs, web_ctrl_obs = make_web_er_from_msmt( - msmt, fingerprintdb=fingerprintdb, netinfodb=netinfodb - ) - assert len(web_analysis) == len(web_obs) - assert len(web_ctrl_obs) == 5 - - assert len(er) == 1 - assert er[0].loni_blocked_values == [1.0] - assert er[0].loni_ok_value == 0 - assert er[0].loni_blocked_keys[0].startswith("dns.") - - -def test_website_web_analysis_plaintext_ok(fingerprintdb, netinfodb, measurements): - msmt = load_measurement( - msmt_path=measurements[ - "20220608132401.787399_AM_webconnectivity_2285fc373f62729e" - ], - ) - er, web_analysis, web_obs, web_ctrl_obs = make_web_er_from_msmt( - msmt, fingerprintdb=fingerprintdb, netinfodb=netinfodb - ) - assert len(web_analysis) == len(web_obs) - assert len(web_ctrl_obs) == 2 - - assert len(er) == 1 - ok_dict = dict(zip(er[0].loni_ok_keys, er[0].loni_ok_values)) - assert ok_dict["dns"] > 0.8 - assert ok_dict["tcp"] > 0.8 - assert ok_dict["tls"] > 0.8 - assert ok_dict["http"] > 0.8 - - assert er[0].loni_ok_value > 0.8 - - -def test_website_web_analysis_blocked_2(fingerprintdb, netinfodb, measurements): - msmt = load_measurement( - msmt_path=measurements[ - "20220627030703.592775_IR_webconnectivity_80e199b3c572f8d3" - ], - ) - er, web_analysis, web_obs, web_ctrl_obs = make_web_er_from_msmt( - msmt, fingerprintdb=fingerprintdb, netinfodb=netinfodb - ) - assert len(web_analysis) == len(web_obs) - assert len(web_ctrl_obs) == 6 - - assert len(er) == 1 - assert er[0].loni_blocked_values == [1.0] - assert er[0].loni_ok_value == 0 - assert er[0].loni_blocked_keys[0].startswith("dns.") - - -def test_website_dns_blocking_event(fingerprintdb, netinfodb, measurements): - msmt_path = measurements[ - "20220627134426.194308_DE_webconnectivity_15675b61ec62e268" - ] - msmt = load_measurement( - msmt_path=msmt_path, - ) - er, web_analysis, web_obs, web_ctrl_obs = make_web_er_from_msmt( - msmt, fingerprintdb=fingerprintdb, netinfodb=netinfodb - ) - assert len(web_analysis) == len(web_obs) - assert len(web_ctrl_obs) == 6 - - assert len(er) == 1 - assert er[0].loni_ok_value == 0 - assert er[0].loni_blocked_values[0] > 0.7 - assert er[0].loni_blocked_keys[0].startswith("dns.") - - -def 
test_website_dns_blocking_event_2(fingerprintdb, netinfodb, measurements): - msmt_path = measurements[ - "20220627125833.737451_FR_webconnectivity_bca9ad9d3371919a" - ] - msmt = load_measurement( - msmt_path=msmt_path, - ) - er, web_analysis, web_obs, web_ctrl_obs = make_web_er_from_msmt( - msmt, fingerprintdb=fingerprintdb, netinfodb=netinfodb - ) - assert len(web_analysis) == len(web_obs) - assert len(web_ctrl_obs) == 5 - - assert len(er) == 1 - assert er[0].loni_ok_value == 0 - assert er[0].loni_blocked_values[0] > 0.5 - assert er[0].loni_blocked_keys[0].startswith("dns.") - -def test_website_dns_ok(fingerprintdb, netinfodb, measurements): - msmt_path = measurements[ - "20220625234824.235023_HU_webconnectivity_3435a5df0e743d39" - ] - msmt = load_measurement( - msmt_path=msmt_path, - ) - er, web_analysis, web_obs, web_ctrl_obs = make_web_er_from_msmt( - msmt, fingerprintdb=fingerprintdb, netinfodb=netinfodb - ) - # assert len(web_analysis) == len(web_obs) - assert len(web_ctrl_obs) == 5 - - assert len(er) == 1 - assert er[0].loni_ok_value == 1 - - -# Check this for wc 0.5 overwriting tls analsysis -# 20231031000227.813597_MY_webconnectivity_2f0b80761373aa7e -def test_website_experiment_results(measurements, netinfodb, fingerprintdb): - msmt = load_measurement( - msmt_path=measurements[ - "20221101055235.141387_RU_webconnectivity_046ce024dd76b564" - ] - ) - er, web_analysis, web_obs, web_ctrl_obs = make_web_er_from_msmt( - msmt, fingerprintdb=fingerprintdb, netinfodb=netinfodb - ) - assert len(web_analysis) == len(web_obs) - assert len(web_ctrl_obs) == 3 - - assert len(er) == 1 - assert er[0].loni_ok_value < 0.2 - ok_dict = dict(zip(er[0].loni_ok_keys, er[0].loni_ok_values)) - assert ok_dict["tcp"] == 0 - - blocked_dict = dict(zip(er[0].loni_blocked_keys, er[0].loni_blocked_values)) - assert blocked_dict["tcp.timeout"] > 0.4 - - -def test_website_web_analysis_down(measurements, netinfodb, fingerprintdb): - msmt = load_measurement( - msmt_path=measurements[ - "20240420235427.477327_US_webconnectivity_9b3cac038dc2ba22" - ] - ) - er, web_analysis, web_obs, web_ctrl_obs = make_web_er_from_msmt( - msmt, fingerprintdb=fingerprintdb, netinfodb=netinfodb - ) - assert len(web_analysis) == len(web_obs) - assert len(web_ctrl_obs) == 3 - - assert len(er) == 1 - assert er[0].loni_ok_value < 0.2 - ok_dict = dict(zip(er[0].loni_ok_keys, er[0].loni_ok_values)) - assert ok_dict["tcp"] == 0 - - down_dict = dict(zip(er[0].loni_down_keys, er[0].loni_down_values)) - - blocked_dict = dict(zip(er[0].loni_blocked_keys, er[0].loni_blocked_values)) - - assert sum(down_dict.values()) > sum(blocked_dict.values()) - assert down_dict["tcp.timeout"] > 0.5 - - -def test_website_web_analysis_blocked_connect_reset( - measurements, netinfodb, fingerprintdb +# @pytest.mark.skip(reason="TODO(art): fixme") +# def test_signal(fingerprintdb, netinfodb, measurements): +# signal_old_ca = load_measurement( +# msmt_path=measurements["20221016235944.266268_GB_signal_1265ff650ee17b44"] +# ) +# assert isinstance(signal_old_ca, Signal) +# assert signal_old_ca.test_keys.tls_handshakes + +# for tls_handshake in signal_old_ca.test_keys.tls_handshakes: +# assert tls_handshake.peer_certificates +# assert tls_handshake.server_name +# certificate_chain = list( +# map(lambda c: b64decode(c.data), tls_handshake.peer_certificates) +# ) +# validate_cert_chain( +# datetime(2021, 10, 16), +# certificate_chain=certificate_chain, +# pem_cert_store=SIGNAL_PEM_STORE, +# ) + +# signal_new_ca = load_measurement( +# 
msmt_path=measurements["20221020235950.432819_NL_signal_27b05458f186a906"] +# ) +# assert isinstance(signal_new_ca, Signal) +# assert signal_new_ca.test_keys.tls_handshakes + +# for tls_handshake in signal_new_ca.test_keys.tls_handshakes: +# assert tls_handshake.peer_certificates +# assert tls_handshake.server_name +# certificate_chain = list( +# map(lambda c: b64decode(c.data), tls_handshake.peer_certificates) +# ) +# validate_cert_chain( +# datetime(2022, 10, 20), +# certificate_chain=certificate_chain, +# pem_cert_store=SIGNAL_PEM_STORE, +# ) + +# web_observations = measurement_to_observations(signal_new_ca, netinfodb=netinfodb)[ +# 0 +# ] +# er = list( +# make_signal_experiment_result( +# web_observations=web_observations, +# fingerprintdb=fingerprintdb, +# ) +# ) +# assert er[0].anomaly == False +# assert er[0].confirmed == False + +# signal_blocked_uz = load_measurement( +# msmt_path=measurements["20210926222047.205897_UZ_signal_95fab4a2e669573f"] +# ) +# assert isinstance(signal_blocked_uz, Signal) +# web_observations = measurement_to_observations( +# signal_blocked_uz, netinfodb=netinfodb +# )[0] +# blocking_event = list( +# make_signal_experiment_result( +# web_observations=web_observations, +# fingerprintdb=fingerprintdb, +# ) +# ) +# assert blocking_event[0].anomaly == True +# assert blocking_event[0].confirmed == False +# tls_be = list( +# filter( +# lambda be: be.outcome_category == "tls", +# blocking_event, +# ) +# ) +# assert len(tls_be) > 0 + +# signal_blocked_ir = load_measurement( +# msmt_path=measurements["20221018174612.488229_IR_signal_f8640b28061bec06"] +# ) +# assert isinstance(signal_blocked_ir, Signal) +# web_observations = measurement_to_observations( +# signal_blocked_ir, netinfodb=netinfodb +# )[0] +# blocking_event = list( +# make_signal_experiment_result( +# web_observations=web_observations, +# fingerprintdb=fingerprintdb, +# ) +# ) +# assert blocking_event[0].anomaly == True +# dns_outcomes = list( +# filter( +# lambda be: be.outcome_category == "dns", +# blocking_event, +# ) +# ) +# assert len(dns_outcomes) > 0 +# assert blocking_event[0].confirmed == True + + +def perform_analysis( + db, + netinfodb, + measurements, + measurement_uid: str, ): - msmt_path = measurements[ - "20240302000048.790188_RU_webconnectivity_e7ffd3bc0f525eb7" - ] - msmt = load_measurement(msmt_path=msmt_path) - er, web_analysis, web_obs, web_ctrl_obs = make_web_er_from_msmt( - msmt, fingerprintdb=fingerprintdb, netinfodb=netinfodb - ) - # assert len(web_analysis) == len(web_obs) - assert len(web_ctrl_obs) == 4 - - assert len(er) == 1 - # TODO(art): this should be changed - # assert er[0].loni_ok_value == 0 - assert er[0].loni_ok_value < 0.2 - - ok_dict = dict(zip(er[0].loni_ok_keys, er[0].loni_ok_values)) - assert ok_dict["tls"] == 0 - - down_dict = dict(zip(er[0].loni_down_keys, er[0].loni_down_values)) - blocked_dict = dict(zip(er[0].loni_blocked_keys, er[0].loni_blocked_values)) - - assert sum(down_dict.values()) < sum(blocked_dict.values()) - assert blocked_dict["tls.connection_reset"] > 0.5 - - -def print_debug_er(er): - for idx, e in enumerate(er): - print(f"\n# ER#{idx}") - for idx, transcript in enumerate(e.analysis_transcript_list): - print(f"## Analysis #{idx}") - print("\n".join(transcript)) - pprint(er) - - -def test_website_web_analysis_nxdomain_down(measurements, netinfodb, fingerprintdb): - msmt_path = measurements[ - "20240302000050.000654_SN_webconnectivity_fe4221088fbdcb0a" - ] - msmt = load_measurement(msmt_path=msmt_path) - er, web_analysis, web_obs, web_ctrl_obs = 
make_web_er_from_msmt( - msmt, fingerprintdb=fingerprintdb, netinfodb=netinfodb - ) - assert len(web_analysis) == len(web_obs) - assert len(web_ctrl_obs) == 2 - - assert len(er) == 1 - assert er[0].loni_ok_value < 0.2 - - ok_dict = dict(zip(er[0].loni_ok_keys, er[0].loni_ok_values)) - assert ok_dict["dns"] == 0 - - down_dict = dict(zip(er[0].loni_down_keys, er[0].loni_down_values)) - blocked_dict = dict(zip(er[0].loni_blocked_keys, er[0].loni_blocked_values)) - - assert sum(down_dict.values()) > sum(blocked_dict.values()) - assert down_dict["dns.nxdomain"] > 0.7 - - -def test_website_web_analysis_nxdomain_blocked(measurements, netinfodb, fingerprintdb): - msmt_path = measurements[ - "20240302000305.316064_EG_webconnectivity_397bca9091b07444" - ] - msmt = load_measurement(msmt_path=msmt_path) - er, web_analysis, web_obs, web_ctrl_obs = make_web_er_from_msmt( - msmt, fingerprintdb=fingerprintdb, netinfodb=netinfodb - ) - assert len(web_analysis) == len(web_obs) - assert len(web_ctrl_obs) == 7 - - assert len(er) == 1 - assert er[0].loni_ok_value < 0.2 - - ok_dict = dict(zip(er[0].loni_ok_keys, er[0].loni_ok_values)) - assert ok_dict["dns"] == 0 - - down_dict = dict(zip(er[0].loni_down_keys, er[0].loni_down_values)) - blocked_dict = dict(zip(er[0].loni_blocked_keys, er[0].loni_blocked_values)) - - assert sum(down_dict.values()) < sum(blocked_dict.values()) - assert blocked_dict["dns.nxdomain"] > 0.7 - - -def test_website_web_analysis_blocked_inconsistent_country( - measurements, netinfodb, fingerprintdb -): - msmt_path = measurements[ - "20240309112858.009725_SE_webconnectivity_dce757ef4ec9b6c8" - ] - msmt = load_measurement(msmt_path=msmt_path) - er, web_analysis, web_obs, web_ctrl_obs = make_web_er_from_msmt( - msmt, fingerprintdb=fingerprintdb, netinfodb=netinfodb + msmt = load_measurement(msmt_path=measurements[measurement_uid]) + ts = datetime.strptime(msmt.measurement_start_time, "%Y-%m-%d %H:%M:%S") + write_observations_to_db( + db=db, + netinfodb=netinfodb, + msmt=msmt, + bucket_date="1984-01-01", + ) + db.flush() + analysis_list = list( + get_analysis_web_fuzzy_logic( + db=db, + start_time=ts - timedelta(days=1), + end_time=ts + timedelta(days=1), + probe_cc=[], + measurement_uid=measurement_uid, + ) ) - assert len(web_analysis) == len(web_obs) - assert len(web_ctrl_obs) == 3 - - assert len(er) == 1 - assert er[0].loni_ok_value < 0.2 - - ok_dict = dict(zip(er[0].loni_ok_keys, er[0].loni_ok_values)) - assert ok_dict["dns"] == 0 - - down_dict = dict(zip(er[0].loni_down_keys, er[0].loni_down_values)) - blocked_dict = dict(zip(er[0].loni_blocked_keys, er[0].loni_blocked_values)) - - assert sum(down_dict.values()) > sum(blocked_dict.values()) + assert len(analysis_list) == 1 + return analysis_list[0] + + +def test_website_web_analysis_blocked(db, netinfodb, measurements): + measurement_uid = "20221110235922.335062_IR_webconnectivity_e4114ee32b8dbf74" + analysis = perform_analysis( + db=db, + netinfodb=netinfodb, + measurements=measurements, + measurement_uid=measurement_uid, + ) + assert analysis["dns_blocked_max"] > 0.9 + + +def test_website_web_analysis_plaintext_ok(db, netinfodb, measurements): + measurement_uid = "20220608132401.787399_AM_webconnectivity_2285fc373f62729e" + analysis = perform_analysis( + db=db, + netinfodb=netinfodb, + measurements=measurements, + measurement_uid=measurement_uid, + ) + assert analysis["dns_blocked_max"] < 0.2 + assert analysis["tcp_blocked_max"] < 0.2 + assert analysis["tls_blocked_max"] < 0.2 + # assert analysis["http_blocked_max"] < 0.5 + assert 
analysis["dns_ok_max"] > 0.8 + assert analysis["tcp_ok_max"] > 0.8 + + +def test_website_web_analysis_blocked_2(db, netinfodb, measurements): + measurement_uid = "20220627030703.592775_IR_webconnectivity_80e199b3c572f8d3" + analysis = perform_analysis( + db=db, + netinfodb=netinfodb, + measurements=measurements, + measurement_uid=measurement_uid, + ) + assert analysis["dns_blocked_max"] > 0.8 + assert analysis["dns_ok_max"] < 0.2 + + +def test_website_dns_blocking_event(db, netinfodb, measurements): + measurement_uid = "20220627134426.194308_DE_webconnectivity_15675b61ec62e268" + analysis = perform_analysis( + db=db, + netinfodb=netinfodb, + measurements=measurements, + measurement_uid=measurement_uid, + ) + assert analysis["dns_blocked_max"] > 0.8 + assert analysis["dns_ok_max"] < 0.2 + + +def test_website_dns_blocking_event_2(db, netinfodb, measurements): + measurement_uid = "20220627125833.737451_FR_webconnectivity_bca9ad9d3371919a" + analysis = perform_analysis( + db=db, + netinfodb=netinfodb, + measurements=measurements, + measurement_uid=measurement_uid, + ) + assert analysis["dns_blocked_max"] > 0.6 + assert analysis["dns_ok_max"] < 0.4 + + +def test_website_dns_ok(db, netinfodb, measurements): + measurement_uid = "20220625234824.235023_HU_webconnectivity_3435a5df0e743d39" + analysis = perform_analysis( + db=db, + netinfodb=netinfodb, + measurements=measurements, + measurement_uid=measurement_uid, + ) + assert analysis["dns_ok_max"] == 1.0 + assert analysis["tcp_ok_max"] == 1.0 + assert analysis["tls_ok_max"] == 1.0 + + +# # Check this for wc 0.5 overwriting tls analsysis +# # 20231031000227.813597_MY_webconnectivity_2f0b80761373aa7e +def test_website_experiment_results(measurements, netinfodb, db): + measurement_uid = "20221101055235.141387_RU_webconnectivity_046ce024dd76b564" + analysis = perform_analysis( + db=db, + netinfodb=netinfodb, + measurements=measurements, + measurement_uid=measurement_uid, + ) + assert analysis["dns_blocked_max"] < 0.5 + assert analysis["tcp_blocked_max"] > 0.6 + assert analysis["top_tcp_failure"] == "generic_timeout_error" + assert analysis["tls_blocked_max"] == 0.0 + + +def test_website_web_analysis_down(measurements, netinfodb, db): + measurement_uid = "20240420235427.477327_US_webconnectivity_9b3cac038dc2ba22" + analysis = perform_analysis( + db=db, + netinfodb=netinfodb, + measurements=measurements, + measurement_uid=measurement_uid, + ) + assert analysis["dns_blocked_max"] < 0.5 + assert analysis["tcp_down_max"] > 0.6 + assert analysis["top_tcp_failure"] == "generic_timeout_error" + assert analysis["tls_blocked_max"] == 0.0 + + +def test_website_web_analysis_blocked_connect_reset(measurements, netinfodb, db): + measurement_uid = "20240302000048.790188_RU_webconnectivity_e7ffd3bc0f525eb7" + analysis = perform_analysis( + db=db, + netinfodb=netinfodb, + measurements=measurements, + measurement_uid=measurement_uid, + ) + assert analysis["dns_blocked_max"] < 0.5 + assert analysis["tcp_blocked_max"] < 0.5 + assert analysis["tls_blocked_max"] > 0.7 + assert analysis["top_tls_failure"] == "connection_reset" + + +def test_website_web_analysis_nxdomain_down(measurements, netinfodb, db): + measurement_uid = "20240302000050.000654_SN_webconnectivity_fe4221088fbdcb0a" + analysis = perform_analysis( + db=db, + netinfodb=netinfodb, + measurements=measurements, + measurement_uid=measurement_uid, + ) + assert analysis["dns_down_max"] > 0.6 + assert analysis["top_dns_failure"] == "dns_nxdomain_error" + + +def 
test_website_web_analysis_nxdomain_blocked(measurements, netinfodb, db): + measurement_uid = "20240302000305.316064_EG_webconnectivity_397bca9091b07444" + analysis = perform_analysis( + db=db, + netinfodb=netinfodb, + measurements=measurements, + measurement_uid=measurement_uid, + ) + assert analysis["dns_blocked_max"] > 0.6 + assert analysis["top_dns_failure"] == "dns_nxdomain_error" + + +def test_website_web_analysis_blocked_inconsistent_country(measurements, netinfodb, db): + measurement_uid = "20240309112858.009725_SE_webconnectivity_dce757ef4ec9b6c8" + analysis = perform_analysis( + db=db, + netinfodb=netinfodb, + measurements=measurements, + measurement_uid=measurement_uid, + ) + assert analysis["dns_ok_max"] < 0.3 + assert analysis["dns_blocked_max"] > 0.5 + assert analysis["top_dns_failure"] == None diff --git a/oonipipeline/tests/test_ctrl.py b/oonipipeline/tests/test_ctrl.py deleted file mode 100644 index d908a94e..00000000 --- a/oonipipeline/tests/test_ctrl.py +++ /dev/null @@ -1,241 +0,0 @@ -from datetime import date, datetime, timezone -import time - -from oonidata.models.observations import MeasurementMeta, ProbeMeta, WebObservation - -from oonipipeline.analysis.datasources import iter_web_observations -from oonipipeline.db.connections import ClickhouseConnection -from oonipipeline.analysis.control import ( - WebGroundTruthDB, - iter_web_ground_truths, -) -from oonipipeline.temporal.activities.observations import ( - MakeObservationsFileEntryBatch, - make_observations_for_file_entry_batch, -) - -DUMMY_PROBE_META = ProbeMeta( - probe_asn=6167, - probe_cc="US", - probe_as_org_name="Verizon Business", - probe_as_cc="US", - probe_as_name="20211102", - network_type="TEST", - platform="TEST", - origin="", - engine_name="TEST", - engine_version="TEST", - architecture="TEST", - resolver_ip="141.207.147.254", - resolver_asn=22394, - resolver_cc="US", - resolver_as_org_name="Verizon Business", - resolver_as_cc="US", - resolver_is_scrubbed=False, - resolver_asn_probe=22394, - resolver_as_org_name_probe="Verizon Business", -) - -DUMMY_MEASUREMENT_META = MeasurementMeta( - software_name="TEST", - software_version="TEST", - bucket_date="2023-10-31", - test_name="web_connectivity", - test_version="0.4.2", - measurement_uid="TEST", - ooni_run_link_id="1234", - input=None, - report_id="TEST", - measurement_start_time=datetime(2023, 10, 31, 15, 56, 12), -) - - -def test_web_ground_truth_from_clickhouse(db, datadir, netinfodb, tmp_path): - file_entry_batch = [ - ( - "ooni-data-eu-fra", - "raw/20231031/15/US/webconnectivity/2023103115_US_webconnectivity.n1.7.tar.gz", - "tar.gz", - 5798373, - ) - ] - obs_msmt_count = make_observations_for_file_entry_batch( - file_entry_batch=file_entry_batch, - clickhouse=db.clickhouse_url, - write_batch_size=1, - data_dir=datadir, - bucket_date="2023-10-31", - probe_cc=["US"], - fast_fail=False, - ) - assert obs_msmt_count == 299 - # Wait for buffers to flush - time.sleep(3) - ground_truth_db_path = tmp_path / "test-groundtruthdbUSONLY-2023-10-31.sqlite3" - web_ground_truth_db = WebGroundTruthDB( - connect_str=str(ground_truth_db_path.absolute()) - ) - web_ground_truth_db.build_from_rows( - rows=iter_web_ground_truths( - db=db, - measurement_day=date(2023, 10, 31), - netinfodb=netinfodb, - ) - ) - - wgt_db = WebGroundTruthDB() - wgt_db.build_from_existing(str(ground_truth_db_path.absolute())) - - web_obs = [ - WebObservation( - probe_meta=DUMMY_PROBE_META, - measurement_meta=DUMMY_MEASUREMENT_META, - # The only things we look at to find the groundtruth are 
hostname, ip, http_request_url - hostname="explorer.ooni.org", - ip="37.218.242.149", - port=443, - http_request_url="https://explorer.ooni.org/", - created_at=datetime(2023, 11, 17, 10, 35, 34), - observation_idx=1, - target_id=None, - transaction_id=None, - ip_asn=54113, - ip_as_org_name="Fastly, Inc.", - ip_as_cc="US", - ip_cc="US", - ip_is_bogon=False, - dns_query_type="A", - dns_failure=None, - dns_engine="system", - dns_engine_resolver_address="", - dns_answer_type="A", - dns_answer="151.101.65.195", - dns_answer_asn=54113, - dns_answer_as_org_name="Fastly, Inc.", - dns_t=0.117683385, - tcp_failure=None, - tcp_success=True, - tcp_t=0.583859739, - tls_failure=None, - tls_server_name=None, - tls_version=None, - tls_cipher_suite=None, - tls_is_certificate_valid=True, - tls_end_entity_certificate_fingerprint=None, - tls_end_entity_certificate_subject=None, - tls_end_entity_certificate_subject_common_name=None, - tls_end_entity_certificate_issuer=None, - tls_end_entity_certificate_issuer_common_name=None, - tls_end_entity_certificate_san_list=[], - tls_end_entity_certificate_not_valid_after=None, - tls_end_entity_certificate_not_valid_before=None, - tls_certificate_chain_length=3, - tls_certificate_chain_fingerprints=[], - tls_handshake_read_count=2, - tls_handshake_write_count=4, - tls_handshake_read_bytes=7201.0, - tls_handshake_write_bytes=392.0, - tls_handshake_last_operation="write_4", - tls_handshake_time=0.07061901100000001, - tls_t=0.654237447, - http_network=None, - http_alpn=None, - http_failure=None, - http_request_body_length=None, - http_request_method=None, - http_runtime=None, - http_response_body_length=None, - http_response_body_is_truncated=None, - http_response_body_sha1=None, - http_response_status_code=None, - http_response_header_location=None, - http_response_header_server=None, - http_request_redirect_from=None, - http_request_body_is_truncated=None, - http_t=None, - probe_analysis="false", - ) - ] - - relevant_gts = web_ground_truth_db.lookup_by_web_obs(web_obs=web_obs) - assert len(relevant_gts) == 2 - for gt in relevant_gts: - if gt.ip: - assert gt.ip_asn == 47172 - assert gt.ip_as_org_name - assert "greenhost" in gt.ip_as_org_name.lower() - - # for gt in relevant_gts: - # print_nice_vertical(gt) - - -def test_web_ground_truth_db(): - base_wgt = dict( - vp_asn=0, - vp_cc="ZZ", - is_trusted_vp=True, - timestamp=datetime.now(), - hostname=None, - ip=None, - ip_asn=100, - ip_as_org_name="fake", - port=80, - dns_failure=None, - dns_success=True, - tcp_failure="", - tcp_success=True, - tls_failure="", - tls_success=True, - tls_is_certificate_valid=True, - http_request_url=None, - http_failure="", - http_success=True, - http_response_body_length=42, - count=1, - ) - all_wgt = [] - for _ in range(10): - wgt_dict = base_wgt.copy() - wgt_dict["ip"] = "1.1.1.1" - wgt_dict["port"] = 80 - all_wgt.append(wgt_dict) - - for _ in range(10): - wgt_dict = base_wgt.copy() - wgt_dict["hostname"] = "ooni.org" - all_wgt.append(wgt_dict) - - for _ in range(10): - wgt_dict = base_wgt.copy() - wgt_dict["http_request_url"] = "https://ooni.org/" - all_wgt.append(wgt_dict) - - iter_rows = map(lambda x: (list(x.keys()), list(x.values())), all_wgt) - - wgt_db = WebGroundTruthDB() - wgt_db.build_from_rows(rows=iter_rows) - res = wgt_db.lookup(probe_cc="IT", probe_asn=100, hostnames=["ooni.org"]) - # They should be aggregated - assert len(res) == 1 - assert res[0].count == 10 - - res = wgt_db.lookup(probe_cc="IT", probe_asn=100, ip_ports=[("1.1.1.1", 80)]) - assert len(res) == 1 - assert 
res[0].count == 10 - - res = wgt_db.lookup( - probe_cc="IT", probe_asn=100, http_request_urls=["https://ooni.org/"] - ) - assert len(res) == 1 - assert res[0].count == 10 - assert res[0].http_success - - res = wgt_db.lookup( - probe_cc="IT", - probe_asn=100, - http_request_urls=["https://ooni.org/"], - ip_ports=[("1.1.1.1", 80)], - hostnames=["ooni.org"], - ) - assert len(res) == 3 - assert all(r.count == 10 for r in res) diff --git a/oonipipeline/tests/test_temporal_e2e.py b/oonipipeline/tests/test_temporal_e2e.py index d425cf7a..15724319 100644 --- a/oonipipeline/tests/test_temporal_e2e.py +++ b/oonipipeline/tests/test_temporal_e2e.py @@ -28,8 +28,6 @@ async def test_scheduling(datadir, db): client=env.client, probe_cc=[], test_name=[], - clickhouse_url=db.clickhouse_url, - data_dir=str(datadir), ) assert sched_res.analysis assert sched_res.observations @@ -60,8 +58,6 @@ async def test_scheduling(datadir, db): client=env.client, probe_cc=[], test_name=[], - clickhouse_url=db.clickhouse_url, - data_dir=str(datadir), ) assert sched_res.analysis != sched_res2.analysis assert sched_res.observations != sched_res2.observations @@ -72,8 +68,6 @@ async def test_observation_workflow(datadir, db): obs_params = ObservationsWorkflowParams( probe_cc=["BA"], test_name=["web_connectivity"], - clickhouse=db.clickhouse_url, - data_dir=str(datadir.absolute()), fast_fail=False, bucket_date="2022-10-21", ) diff --git a/oonipipeline/tests/test_workflows.py b/oonipipeline/tests/test_workflows.py index ebc82443..494fe2d9 100644 --- a/oonipipeline/tests/test_workflows.py +++ b/oonipipeline/tests/test_workflows.py @@ -2,9 +2,10 @@ import gzip from pathlib import Path import sqlite3 -from typing import List, Tuple +from typing import Dict, List, Tuple from unittest.mock import MagicMock +from oonipipeline.db.connections import ClickhouseConnection from temporalio.testing import WorkflowEnvironment from temporalio.worker import Worker from temporalio import activity @@ -20,12 +21,8 @@ from oonipipeline.temporal.activities.common import ( ClickhouseParams, OptimizeTablesParams, - get_obs_count_by_cc, - ObsCountParams, ) from oonipipeline.temporal.activities.observations import ( - DeletePreviousRangeParams, - GetPreviousRangeParams, MakeObservationsParams, MakeObservationsResult, make_observations_for_file_entry_batch, @@ -35,18 +32,6 @@ from oonipipeline.temporal.activities.analysis import ( MakeAnalysisParams, make_analysis_in_a_day, - make_cc_batches, -) -from oonipipeline.temporal.common import ( - TS_FORMAT, - BatchParameters, - PrevRange, - get_prev_range, - maybe_delete_prev_range, -) -from oonipipeline.temporal.activities.ground_truths import ( - MakeGroundTruthsParams, - make_ground_truths_in_day, ) from oonipipeline.temporal.workflows.analysis import ( AnalysisWorkflowParams, @@ -58,100 +43,23 @@ ) from oonipipeline.temporal.workflows.common import TASK_QUEUE_NAME -from .utils import wait_for_mutations - -def test_get_prev_range(db): - db.execute("DROP TABLE IF EXISTS test_range") - db.execute( - """CREATE TABLE test_range ( - created_at DateTime64(3, 'UTC'), - bucket_date String, - test_name String, - probe_cc String - ) - ENGINE = MergeTree - ORDER BY (bucket_date, created_at) - """ - ) - bucket_date = "2000-01-01" - test_name = "web_connectivity" - probe_cc = "IT" - min_time = datetime(2000, 1, 1, 23, 42, 00) - rows = [(min_time, bucket_date, test_name, probe_cc)] - for i in range(200): - rows.append((min_time + timedelta(seconds=i), bucket_date, test_name, probe_cc)) - db.execute( - "INSERT INTO 
test_range (created_at, bucket_date, test_name, probe_cc) VALUES", - rows, - ) - prev_range = get_prev_range( - db, - "test_range", - test_name=[test_name], - bucket_date=bucket_date, - probe_cc=[probe_cc], - ) - assert prev_range.min_created_at and prev_range.max_created_at - assert prev_range.min_created_at == (min_time - timedelta(seconds=1)).strftime( - TS_FORMAT - ) - assert prev_range.max_created_at == (rows[-1][0] + timedelta(seconds=1)).strftime( - TS_FORMAT - ) - - db.execute("TRUNCATE TABLE test_range") - bucket_date = "2000-03-01" - test_name = "web_connectivity" - probe_cc = "IT" - min_time = datetime(2000, 1, 1, 23, 42, 00) - rows: List[Tuple[datetime, str, str, str]] = [] - for i in range(10): - rows.append( - (min_time + timedelta(seconds=i), "2000-02-01", test_name, probe_cc) - ) - min_time = rows[-1][0] - for i in range(10): - rows.append((min_time + timedelta(seconds=i), bucket_date, test_name, probe_cc)) - - db.execute( - "INSERT INTO test_range (created_at, bucket_date, test_name, probe_cc) VALUES", - rows, - ) - prev_range = get_prev_range( - db, - "test_range", - test_name=[test_name], - bucket_date=bucket_date, - probe_cc=[probe_cc], - ) - assert prev_range.min_created_at and prev_range.max_created_at - assert prev_range.min_created_at == (min_time - timedelta(seconds=1)).strftime( - TS_FORMAT - ) - assert prev_range.max_created_at == (rows[-1][0] + timedelta(seconds=1)).strftime( - TS_FORMAT - ) - - maybe_delete_prev_range( - db=db, - prev_range=prev_range, - ) - wait_for_mutations(db, "test_range") - res = db.execute("SELECT COUNT() FROM test_range") - assert res[0][0] == 10 - db.execute("DROP TABLE test_range") - - -def test_make_cc_batches(): - cc_batches = make_cc_batches( - cnt_by_cc={"IT": 100, "IR": 300, "US": 1000}, - probe_cc=["IT", "IR", "US"], - parallelism=2, - ) - assert len(cc_batches) == 2 - # We expect the batches to be broken up into (IT, IR), ("US") - assert any([set(x) == set(["US"]) for x in cc_batches]) == True +def get_obs_count_by_cc( + clickhouse_url: str, table_name: str, start_day: str, end_day: str +) -> Dict[str, int]: + with ClickhouseConnection(clickhouse_url) as db: + q = f""" + SELECT + probe_cc, COUNT() + FROM {table_name} + WHERE measurement_start_time > %(start_day)s AND measurement_start_time < %(end_day)s + GROUP BY probe_cc + """ + cc_list: List[Tuple[str, int]] = db.execute( + q, {"start_day": start_day, "end_day": end_day} + ) # type: ignore + assert isinstance(cc_list, list) + return dict(cc_list) def test_make_file_entry_batch(datadir, db): @@ -174,24 +82,13 @@ def test_make_file_entry_batch(datadir, db): ) assert obs_msmt_count == 453 - make_ground_truths_in_day( - MakeGroundTruthsParams( - day=date(2023, 10, 31).strftime("%Y-%m-%d"), - clickhouse=db.clickhouse_url, - data_dir=datadir, - ), - ) - analysis_res = make_analysis_in_a_day( + make_analysis_in_a_day( MakeAnalysisParams( probe_cc=["IR"], test_name=["webconnectivity"], - clickhouse=db.clickhouse_url, - data_dir=datadir, - fast_fail=False, day=date(2023, 10, 31).strftime("%Y-%m-%d"), ), ) - assert analysis_res["count"] == obs_msmt_count def test_write_observations(measurements, netinfodb, db): @@ -224,11 +121,10 @@ def test_write_observations(measurements, netinfodb, db): db.write_table_model_rows(obs_list) db.close() cnt_by_cc = get_obs_count_by_cc( - ObsCountParams( - clickhouse_url=db.clickhouse_url, - start_day="2020-01-01", - end_day="2023-12-01", - ) + clickhouse_url=db.clickhouse_url, + start_day="2020-01-01", + end_day="2023-12-01", + table_name="obs_web", ) 
assert cnt_by_cc["CH"] == 2 assert cnt_by_cc["GR"] == 20 @@ -322,45 +218,6 @@ async def optimize_tables_mocked(params: OptimizeTablesParams): return -@activity.defn(name="make_ground_truths_in_day") -async def make_ground_truths_in_day_mocked(params: MakeGroundTruthsParams): - return - - -@activity.defn(name="get_previous_range") -async def get_previous_range_mocked(params: GetPreviousRangeParams) -> List[PrevRange]: - return [ - PrevRange( - table_name="obs_web", - batch_parameters=BatchParameters( - test_name=[], - probe_cc=[], - bucket_date="2024-01-01", - timestamp=datetime(2024, 1, 1).strftime(TS_FORMAT), - ), - timestamp_column="timestamp", - probe_cc_column="probe_cc", - max_created_at=datetime(2024, 9, 1, 12, 34, 56).strftime(TS_FORMAT), - min_created_at=datetime(2024, 9, 1, 1, 23, 45).strftime(TS_FORMAT), - ) - ] - - -@activity.defn(name="delete_previous_range") -async def delete_previous_range_mocked(params: DeletePreviousRangeParams) -> None: - return - - -@activity.defn(name="get_obs_count_by_cc") -async def get_obs_count_by_cc_mocked(params: ObsCountParams): - return { - "AU": 90, - "IT": 1000, - "IR": 200, - "NZ": 42, - } - - @activity.defn(name="make_observations") async def make_observations_mocked( params: MakeObservationsParams, @@ -374,8 +231,8 @@ async def make_observations_mocked( @activity.defn(name="make_analysis_in_a_day") -async def make_analysis_in_a_day_mocked(params: MakeAnalysisParams) -> dict: - return {"count": 100} +async def make_analysis_in_a_day_mocked(params: MakeAnalysisParams): + pass @pytest.mark.asyncio @@ -383,13 +240,11 @@ async def test_temporal_workflows(): obs_params = ObservationsWorkflowParams( probe_cc=[], test_name=[], - clickhouse="", - data_dir="", fast_fail=False, bucket_date="2024-01-02", ) analysis_params = AnalysisWorkflowParams( - probe_cc=[], test_name=[], clickhouse="", data_dir="", day="2024-01-01" + probe_cc=[], test_name=[], day="2024-01-01" ) async with await WorkflowEnvironment.start_time_skipping() as env: async with Worker( @@ -399,12 +254,8 @@ async def test_temporal_workflows(): activities=[ optimize_tables_mocked, optimize_all_tables_mocked, - make_ground_truths_in_day_mocked, - get_obs_count_by_cc_mocked, make_analysis_in_a_day_mocked, make_observations_mocked, - get_previous_range_mocked, - delete_previous_range_mocked, ], ): res = await env.client.execute_workflow( @@ -423,7 +274,6 @@ async def test_temporal_workflows(): id="analysis-wf", task_queue=TASK_QUEUE_NAME, ) - assert res["obs_count"] == 300 assert res["day"] == "2024-01-01"
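Note on the resulting API shape (a minimal sketch, not part of the diff itself): ObservationsWorkflowParams and AnalysisWorkflowParams no longer carry clickhouse or data_dir; the ClickHouse URL and data directory are read from oonipipeline.settings.config on the pipeline side, as the observations.py and conftest.py hunks above show. The sketch assumes the workflow classes are exported as ObservationsWorkflow and AnalysisWorkflow, that a Temporal server with a running worker is reachable at localhost:7233, and the workflow ids are arbitrary.

import asyncio

from temporalio.client import Client

from oonipipeline.temporal.workflows.common import TASK_QUEUE_NAME
from oonipipeline.temporal.workflows.analysis import (
    AnalysisWorkflow,  # assumed export name
    AnalysisWorkflowParams,
)
from oonipipeline.temporal.workflows.observations import (
    ObservationsWorkflow,  # assumed export name
    ObservationsWorkflowParams,
)


async def main() -> None:
    client = await Client.connect("localhost:7233")

    # No clickhouse= or data_dir= arguments anymore; the worker reads both
    # from oonipipeline.settings.config.
    obs_params = ObservationsWorkflowParams(
        probe_cc=["IT"],
        test_name=["web_connectivity"],
        fast_fail=False,
        bucket_date="2024-01-02",
    )
    await client.execute_workflow(
        ObservationsWorkflow.run,
        obs_params,
        id="observations-2024-01-02",
        task_queue=TASK_QUEUE_NAME,
    )

    analysis_params = AnalysisWorkflowParams(
        probe_cc=["IT"], test_name=["web_connectivity"], day="2024-01-01"
    )
    analysis_res = await client.execute_workflow(
        AnalysisWorkflow.run,
        analysis_params,
        id="analysis-2024-01-01",
        task_queue=TASK_QUEUE_NAME,
    )
    # The analysis workflow now only reports the processed day, not obs_count.
    assert analysis_res["day"] == "2024-01-01"


if __name__ == "__main__":
    asyncio.run(main())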
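Note on the transforms change (a short sketch, not part of the diff): the measurement_to_observations() overloads now default bucket_date to the dummy value "1984-01-01", so one-off callers can omit it. The NetinfoDB constructor keywords and the measurement path below are assumptions for illustration only.

from pathlib import Path

from oonidata.dataclient import load_measurement

from oonipipeline.netinfo import NetinfoDB
from oonipipeline.settings import config
from oonipipeline.transforms.observations import measurement_to_observations

# Placeholder path; any locally stored web_connectivity measurement works.
msmt = load_measurement(msmt_path="path/to/measurement.json")

# Assumed constructor signature, mirroring how the test fixtures build it.
netinfodb = NetinfoDB(datadir=Path(config.data_dir), download=False)

# Explicit bucket, as the observation-writing activity does:
web_obs, web_ctrl_obs = measurement_to_observations(
    msmt, netinfodb=netinfodb, bucket_date="2024-01-01"
)

# Omitting bucket_date now falls back to the "1984-01-01" dummy bucket,
# which is what the rewritten tests rely on:
web_obs_default, _ = measurement_to_observations(msmt, netinfodb=netinfodb)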
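Note on the rewritten test_analysis.py tests: they all share the perform_analysis pattern above, writing one measurement's observations into ClickHouse, flushing, then reading back a single fuzzy-logic analysis row and asserting on its *_blocked_max / *_down_max / *_ok_max and top_*_failure fields. Adding coverage for a new sample is then a few lines; the measurement UID below is a placeholder and the corresponding sample would first have to exist in the measurements fixture (SAMPLE_MEASUREMENTS in _fixtures).

def test_website_web_analysis_new_case(db, netinfodb, measurements):
    # Placeholder UID: the sample must be added to the fixtures first.
    analysis = perform_analysis(
        db=db,
        netinfodb=netinfodb,
        measurements=measurements,
        measurement_uid="20240101000000.000000_XX_webconnectivity_0000000000000000",
    )
    # Scores are fuzzy-logic memberships in [0, 1].
    assert 0.0 <= analysis["dns_blocked_max"] <= 1.0
    assert analysis["top_dns_failure"] is None or isinstance(
        analysis["top_dns_failure"], str
    )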