Skip to content

Commit

Permalink
feat: add google place ids to bars
Browse files Browse the repository at this point in the history
  • Loading branch information
SchutteJan committed Jul 23, 2024
1 parent 29463bb commit 665aebe
Show file tree
Hide file tree
Showing 8 changed files with 346 additions and 7 deletions.
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ repos:
- id: hadolint-docker

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.5.1
rev: v0.5.4
hooks:
- id: ruff
args: [ "--fix" ]
- id: ruff-format

- repo: local
Expand Down
2 changes: 2 additions & 0 deletions data/bars/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
Verleende exploitatievergunningen horeca met terrasgrenzen en ontheffingen
https://data.amsterdam.nl/data/datasets/GsY50tEkoJKCGw/verleende-exploitatievergunningen-horeca-met-terrasgrenzen-en-ontheffingen?term=Verleende+exploitatievergunningen+horeca+met+terrasgrenzen+en+ontheffingen

https://www.amsterdam.nl/ondernemen/horeca/horeca-kaart/

https://api.data.amsterdam.nl/dcatd/datasets/GsY50tEkoJKCGw


Expand Down
123 changes: 123 additions & 0 deletions data/bars/gmaps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import math
from typing import Tuple, Optional, Dict
from diskcache import Cache

import googlemaps
import os

cache = Cache("./gmaps_cache")


ALLOWED_TYPES = ["bar", "cafe", "restaurant", "establishment"]


def get_gmaps_client():
    """Build a Google Maps client from the ``GMAPS_API`` environment variable.

    Returns:
        A ``googlemaps.Client`` created with the configured API key.

    Raises:
        ValueError: If ``GMAPS_API`` is unset or empty.
    """
    api_key = os.environ.get("GMAPS_API")
    if not api_key:
        # os.environ.get returns None for a missing variable; fail here with a
        # message that names the variable instead of handing None to the client.
        raise ValueError(
            "The GMAPS_API environment variable must be set to a Google Maps API key"
        )
    return googlemaps.Client(key=api_key)


def get_distance(a: Tuple[float, float], b: Tuple[float, float]) -> float:
    """Return the great-circle distance in meters between two points.

    Uses the haversine formula on a sphere with a radius of 6,371,000 m.

    Args:
        a: First point as ``(latitude, longitude)`` in degrees.
        b: Second point as ``(latitude, longitude)`` in degrees.

    Returns:
        The distance between ``a`` and ``b`` in meters.
    """
    earth_radius_m = 6371000

    # The trigonometric functions below work in radians.
    lat1, lon1 = math.radians(a[0]), math.radians(a[1])
    lat2, lon2 = math.radians(b[0]), math.radians(b[1])

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine term
    h = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    )
    # Floating-point rounding can push h marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - h) raise a math
    # domain error; clamp it back into the valid range.
    h = min(1.0, max(0.0, h))

    c = 2 * math.atan2(math.sqrt(h), math.sqrt(1 - h))
    return earth_radius_m * c


@cache.memoize(typed=True)
def gmaps_place_search(
    address: str, location: Tuple[float, float], type: Optional[str]
) -> dict:
    """Run a Google Maps places query, memoized through the module-level cache.

    Args:
        address: Query text handed to the places search.
        location: Point forwarded as the ``location`` argument of the search.
        type: Optional place type forwarded as the ``type`` argument.

    Returns:
        The response returned by the client's ``places`` call.
    """
    client = get_gmaps_client()
    response = client.places(address, location=location, type=type)
    return response


def get_likeliest_place(
    expected_name: str,
    address: str,
    location: Tuple[float, float],
    max_distance_m: float = 300,
) -> Optional[Dict]:
    """Look up the Google Maps place that most likely matches a known venue.

    The first search hit is taken as the candidate and is rejected when it is
    too far from ``location``, has none of the ``ALLOWED_TYPES``, or is not
    reported as operational. Every rejection prints the reason.

    Args:
        expected_name: Name of the venue as known to the caller.
        address: Street address of the venue (searched within Amsterdam).
        location: Known ``(latitude, longitude)`` of the venue.
        max_distance_m: Largest accepted distance in meters between
            ``location`` and the candidate's position.

    Returns:
        The candidate's result dict from the search response, or ``None``
        when nothing was found or the candidate was rejected.
    """
    # Google doesn't format addresses with dashes; replacing them with a space
    # seems to improve search results.
    address = address.replace("-", " ")

    # Keep this call shape (keyword location/type) so memoized entries are reused.
    response = gmaps_place_search(
        f"{expected_name}, {address}, Amsterdam", location=location, type=None
    )

    candidates = response.get("results") or []
    if not candidates:
        print(f"No results found for {expected_name} on {address}")
        return None

    # We assume the first result is the likeliest match
    result = candidates[0]

    position = result["geometry"]["location"]
    distance = get_distance(location, (position["lat"], position["lng"]))

    # The default threshold seems large, but there can be quite a discrepancy
    # between the location known by Gemeente Amsterdam and by Google Maps.
    # Example: 't Blauwe Theehuis
    if distance > max_distance_m:
        print(
            f"Distance between '{expected_name}' and '{result['name']}' is too large ({distance}) meters"
        )
        return None

    # "types" may be missing from a result; treat that as having no types.
    place_types = result.get("types", [])
    if not any(t in place_types for t in ALLOWED_TYPES):
        print(
            f"Result for '{expected_name}' is '{result['name']}' and is not a bar/cafe, but a {place_types}"
        )
        return None

    # "business_status" is not guaranteed to be present; a place without a
    # confirmed OPERATIONAL status is skipped rather than crashing the run.
    if result.get("business_status") != "OPERATIONAL":
        print(
            f"Result for '{expected_name}' is '{result['name']}' but is not in business"
        )
        return None

    return result


if __name__ == "__main__":
    # Manual spot checks for get_likeliest_place, kept commented out.
    # Uncomment a single call to try that case by hand.

    # 't Blauwe Theehuis has a slightly different name according to Google, and a slightly different geolocation
    # get_likeliest_place("'t Blauwe Theehuis", "Vondelpark 5 A", location=(52.35835998002426, 4.870637924609814))

    # A location that should not be in our results because it doesn't have the bar type on Google Maps
    # get_likeliest_place("Koffiehuis De Hoek", "Prinsengracht 341-H",
    # location=(52.372331919453636, 4.883287629621083))

    # "Amigo Cafe" doesn't exist at this address according to Google. There is a cafe called
    # "Brakke grond", but it was closed recently.
    # get_likeliest_place("Amigo Cafe", "Rozengracht 16", location=(52.373917975661, 4.8821682457514))

    # Vondelpark 3 has been renamed to "Park Zuid"
    # get_likeliest_place("Vondelpark 3", "Vondelpark 3", location=(52.36105259638123, 4.874974382779035))

    # Café Oost is a bar, but is listed as "cafe" on Google Maps (it does not show up in type="something"
    # results that are not exact matches)
    # get_likeliest_place("Café Oost", "Krugerplein 40", location=(52.3534727, 4.9198154))

    # Arie Goudvisch is definitely a bar, but Google can't find it when the address contains a dash
    # get_likeliest_place("Arie Goudvisch", "Ferdinand Bolstraat 24-H", location=(52.356687726284804, 4.890481919493629))
    pass
2 changes: 1 addition & 1 deletion data/bars/insert-bars.sql.j2
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ VALUES (
'1',
'{{ f.properties.zaak_specificatie }}',
NULL,
NULL,
{{ f.properties.format_sql_google_place_id() }},
NULL,
'{{ f.properties.adres | replace("'", "''") }}'
);
Expand Down
34 changes: 31 additions & 3 deletions data/bars/render_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
)
import json

from gmaps import get_likeliest_place

GeoJsonStr = str

MANUAL_ZAAK_NAAM_REPLACEMENTS: Final[Dict[str, str]] = {
Expand Down Expand Up @@ -65,6 +67,14 @@ class Properties:
status_verlenging_tijdelijk_terras: Optional[str]
verlenging_tijdelijk_terras_details: Optional[str]

# Additional properties added by enriching with Google Maps
google_place_id: Optional[str] = None

def format_sql_google_place_id(self) -> str:
    """Render ``google_place_id`` as a SQL literal for the insert template.

    Returns:
        The place id as a single-quoted SQL string literal with embedded
        single quotes doubled, or the bare keyword ``NULL`` when no place id
        is set.
    """
    if not self.google_place_id:
        return "NULL"
    # The value comes from an external API; double any single quote so it
    # cannot terminate the SQL string literal early.
    escaped = self.google_place_id.replace("'", "''")
    return f"'{escaped}'"

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "Properties":
    """Create an instance by passing each entry of ``data`` as a keyword argument."""
    instance = cls(**data)
    return instance
Expand Down Expand Up @@ -135,6 +145,22 @@ def filter_hotels(f: Feature) -> Tuple[bool, Feature]:
return "hotel" not in f.properties.zaaknaam.lower(), f


def filter_and_enrich_using_gmaps(f: Feature) -> Tuple[bool, Feature]:
    """Keep a feature only if Google Maps yields a matching place, and enrich it.

    The feature's geometry is parsed as JSON and its ``coordinates`` are read
    as ``(lng, lat)``. When a place is found, the feature's ``zaaknaam`` is
    replaced by the place's name and its ``google_place_id`` is filled in.

    Returns:
        ``(True, f)`` when a place was found, ``(False, f)`` when the feature
        has no name or no acceptable place was found.
    """
    coordinates = json.loads(f.geometry)["coordinates"]
    lng, lat = coordinates

    props = f.properties
    if not props.zaaknaam:
        return False, f

    place = get_likeliest_place(props.zaaknaam, props.adres, (lat, lng))
    if place is None:
        return False, f

    # Prefer the name and id as known to Google Maps.
    props.zaaknaam = place["name"]
    props.google_place_id = place["place_id"]
    return True, f


def manual_substitutions_zaaknaam(f: Feature) -> Tuple[bool, Feature]:
if f.properties.zaaknaam in MANUAL_ZAAK_NAAM_REPLACEMENTS:
f.properties.zaaknaam = MANUAL_ZAAK_NAAM_REPLACEMENTS[f.properties.zaaknaam]
Expand Down Expand Up @@ -165,6 +191,7 @@ def prepare_data(data: Iterable[Feature]) -> List[Feature]:
filter_manual_exclusions,
manual_substitutions_zaaknaam,
beautify_zaaknaam,
filter_and_enrich_using_gmaps,
]

# Some restaurants are also cafes, we try to pull some additional cafes from there
Expand Down Expand Up @@ -192,9 +219,10 @@ def main() -> None:
print(f"Data points before filtering: {len(dataset)}")
prepared_data = prepare_data(dataset)
print(f"Data points after filtering: {len(prepared_data)}")
# for d in prepared_data:
# print(d.properties.zaaknaam, ":", d.properties.zaak_specificatie)
print(template.render(features=prepared_data))
sql = template.render(features=prepared_data)

print("Wrote to bars-export.sql")
Path("bars-export.sql").write_text(sql)


if __name__ == "__main__":
Expand Down
Loading

0 comments on commit 665aebe

Please sign in to comment.