diff --git a/followthemoney/types/address.py b/followthemoney/types/address.py index eba88a315..37b8d5322 100644 --- a/followthemoney/types/address.py +++ b/followthemoney/types/address.py @@ -42,7 +42,7 @@ def clean_text( return collapsed def _specificity(self, value: str) -> float: - return dampen(10, 60, value) + return dampen(10, 30, value) def node_id(self, value: str) -> Optional[str]: slug = slugify(value) diff --git a/followthemoney/types/country.py b/followthemoney/types/country.py index 41c3ed8b3..8722bdb90 100644 --- a/followthemoney/types/country.py +++ b/followthemoney/types/country.py @@ -82,5 +82,8 @@ def clean_text( def country_hint(self, value: str) -> str: return value + def compare(self, left: str, right: str) -> float: + return 1.0 if left == right else 0.0 + def rdf(self, value: str) -> Identifier: return URIRef(f"iso-3166-1:{value}") diff --git a/followthemoney/types/email.py b/followthemoney/types/email.py index 275980a1a..bfd80bc20 100644 --- a/followthemoney/types/email.py +++ b/followthemoney/types/email.py @@ -79,5 +79,10 @@ def clean_text( # def country_hint(self, value) # TODO: do we want to use TLDs as country evidence? + def compare(self, left: str, right: str) -> float: + if left.lower() == right.lower(): + return 1.0 + return 0.0 + def rdf(self, value: str) -> Identifier: return URIRef("mailto:%s" % value.lower()) diff --git a/followthemoney/types/entity.py b/followthemoney/types/entity.py index 72758a658..52081b12f 100644 --- a/followthemoney/types/entity.py +++ b/followthemoney/types/entity.py @@ -64,6 +64,9 @@ def clean_text( return text return None + def compare(self, left: str, right: str) -> float: + return 1.0 if left == right else 0.0 + def rdf(self, value: str) -> Identifier: return URIRef(f"entity:{value}") diff --git a/followthemoney/types/gender.py b/followthemoney/types/gender.py index 133d0ed48..81626f3a2 100644 --- a/followthemoney/types/gender.py +++ b/followthemoney/types/gender.py @@ -61,5 +61,8 @@ def clean_text( return None return code + def compare(self, left: str, right: str) -> float: + return 1.0 if left == right else 0.0 + def rdf(self, value: str) -> Identifier: return URIRef(f"gender:{value}") diff --git a/followthemoney/types/iban.py b/followthemoney/types/iban.py index 485cdccfd..305fc5fcb 100644 --- a/followthemoney/types/iban.py +++ b/followthemoney/types/iban.py @@ -47,6 +47,9 @@ def clean_text( def country_hint(self, value: str) -> str: return value[:2].lower() + def compare(self, left: str, right: str) -> float: + return 1.0 if left == right else 0.0 + def rdf(self, value: str) -> Identifier: return URIRef(self.node_id(value)) diff --git a/followthemoney/types/ip.py b/followthemoney/types/ip.py index 628fdcabb..de6bd320f 100644 --- a/followthemoney/types/ip.py +++ b/followthemoney/types/ip.py @@ -43,5 +43,8 @@ def clean_text( except ValueError: return None + def _specificity(self, value: str) -> float: + return 1.0 + def rdf(self, value: str) -> Identifier: return URIRef(f"ip:{value}") diff --git a/followthemoney/types/language.py b/followthemoney/types/language.py index 74993f86b..d28d89a30 100644 --- a/followthemoney/types/language.py +++ b/followthemoney/types/language.py @@ -113,5 +113,8 @@ def clean_text( return None return code + def _specificity(self, value: str) -> float: + return 1.0 + def rdf(self, value: str) -> Identifier: return URIRef(f"iso-639:{value}") diff --git a/followthemoney/types/mimetype.py b/followthemoney/types/mimetype.py index 41c5c2dea..bbc825f7e 100644 --- a/followthemoney/types/mimetype.py +++ b/followthemoney/types/mimetype.py @@ -37,6 +37,9 @@ def clean_text( return text return None + def _specificity(self, value: str) -> float: + return 1.0 + def rdf(self, value: str) -> Identifier: return URIRef(f"urn:mimetype:{value}") diff --git a/followthemoney/types/string.py b/followthemoney/types/string.py index 7ac991f5b..13b64a2c1 100644 --- a/followthemoney/types/string.py +++ b/followthemoney/types/string.py @@ -38,3 +38,6 @@ class HTMLType(StringType): label = _("HTML") plural = _("HTMLs") max_size = 30 * MEGABYTE + + def compare(self, left: str, right: str) -> float: + return 0.0 diff --git a/followthemoney/types/topic.py b/followthemoney/types/topic.py index 70a5da65a..14dad1e7f 100644 --- a/followthemoney/types/topic.py +++ b/followthemoney/types/topic.py @@ -70,5 +70,8 @@ class TopicType(EnumType): def _locale_names(self, locale: Locale) -> EnumValues: return {k: gettext(v) for (k, v) in self._TOPICS.items()} + def compare(self, left: str, right: str) -> float: + return 1.0 if left == right else 0.0 + def rdf(self, value: str) -> Identifier: return URIRef(f"ftm:topic:{value}") diff --git a/followthemoney/types/url.py b/followthemoney/types/url.py index 4e14a0b0f..3144e2e9c 100644 --- a/followthemoney/types/url.py +++ b/followthemoney/types/url.py @@ -54,8 +54,10 @@ def clean_text( parsed = parsed._replace(path="/") return parsed.geturl() - def _specificity(self, value: str) -> float: - return dampen(10, 120, value) + def compare(self, left: str, right: str) -> float: + if left.lower() == right.lower(): + return 1.0 + return 0.0 def rdf(self, value: str) -> Identifier: return URIRef(value)