Fix: SNU dorm crawl (#98)
* fix: Snudorm crawler breaking changes and minor bug fix

* lint
GoGiants1 authored Nov 11, 2024
1 parent b6a51f1 commit b9eef6e
Showing 6 changed files with 395 additions and 141 deletions.
59 changes: 58 additions & 1 deletion crawlers/base_crawler.py
@@ -138,6 +138,63 @@ def normalize(self, meal, **kwargs):
return meal


class RemoveMealNumber(MealNormalizer):
def normalize(self, meal, **kwargs):
if "①" in meal.name or "②" in meal.name:
meal.set_name(meal.name.replace("①", ""))
meal.set_name(meal.name.replace("②", ""))
return meal


class RemoveMealIdentifierFromMealName(MealNormalizer):
identifiers = ["(잇템)"]

def normalize(self, meal, **kwargs):
for identifier in self.identifiers:
meal.set_name(meal.name.replace(identifier, ""))
return meal


class RemoveInfoFromMealName(MealNormalizer):
info_sign = ["※", "►", "※", "브레이크 타임"]

def normalize(self, meal, **kwargs):
meal.set_name(re.sub("(" + "|".join(self.info_sign) + ").*", "", meal.name))
return meal


class FindRestaurantDetail(MealNormalizer):
restaurant_regex = [
r"(.*)\( ?(\d층.*)\)(.*)",
r"(.*)\((.*식당) ?\)(.*)",
r"(.*)< ?(\d층.*)>(.*)",
r"(.*)<(.*식당) ?>(.*)",
r"(.*)<(테이크아웃)>(.*)",
]

def normalize(self, meal, **kwargs):
for regex in self.restaurant_regex:
m = re.match(regex, meal.name)
if m:
meal.set_restaurant(meal.restaurant + ">" + m.group(2).strip())
meal.set_name(m.group(1).strip() + m.group(3).strip())
return meal


class AddRestaurantDetail(MealNormalizer):
def normalize(self, meal, **kwargs):
details = kwargs.get("restaurant_detail", [])
final_restaurants = kwargs.get("final_restaurants", [])
restaurant = meal.restaurant
for detail in details:
restaurant = restaurant + ">" + detail
if text_normalizer(detail, True) in final_restaurants:
break
meal.set_restaurant(restaurant)

return meal


class RestaurantCrawler(metaclass=ABCMeta):
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"}
url = ""
@@ -188,7 +245,7 @@ def __init__(self):
self.meals = []

@abstractmethod
async def run_30days(self):
async def run_7days(self):
pass

async def run(self, url=None, **kwargs):
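
For context, a minimal stand-alone sketch of how the relocated FindRestaurantDetail splits a location detail out of a meal name. The split_detail helper and the sample restaurant name are hypothetical, written only to exercise the regex list shown above:

import re

restaurant_regex = [
    r"(.*)\( ?(\d층.*)\)(.*)",
    r"(.*)\((.*식당) ?\)(.*)",
    r"(.*)< ?(\d층.*)>(.*)",
    r"(.*)<(.*식당) ?>(.*)",
    r"(.*)<(테이크아웃)>(.*)",
]

def split_detail(restaurant, name):
    # Mirrors FindRestaurantDetail.normalize: the matched detail moves out of
    # the meal name and is appended to the restaurant path, joined with ">".
    for regex in restaurant_regex:
        m = re.match(regex, name)
        if m:
            return restaurant + ">" + m.group(2).strip(), m.group(1).strip() + m.group(3).strip()
    return restaurant, name

print(split_detail("두레미담", "제육볶음 <2층식당>"))  # ('두레미담>2층식당', '제육볶음')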
108 changes: 108 additions & 0 deletions crawlers/misc/snudorm_crawler.py
@@ -0,0 +1,108 @@
import asyncio
import urllib3
import aiohttp
from bs4 import BeautifulSoup
import datetime
from pytz import timezone

from crawlers.base_crawler import (
MealNormalizer,
RestaurantCrawler,
Meal,
text_normalizer,
FindPrice,
FindParenthesisHash,
)


class AddRestaurantDetail(MealNormalizer):
def normalize(self, meal, **kwargs):
details = kwargs.get("restaurant_detail", [])
final_restaurants = kwargs.get("final_restaurants", [])
restaurant = meal.restaurant
for detail in details:
restaurant = restaurant + ">" + detail
if text_normalizer(detail, True) in final_restaurants:
break
meal.set_restaurant(restaurant)
return meal


class SnudormRestaurantCrawler(RestaurantCrawler):
url = "https://snudorm.snu.ac.kr/wp-admin/admin-ajax.php"
menucost_url = "https://snudorm.snu.ac.kr/food-schedule/"
restaurant = "기숙사식당"
normalizer_classes = [FindPrice, FindParenthesisHash, AddRestaurantDetail]

async def get_menucosts(self):
urllib3.disable_warnings()
async with aiohttp.ClientSession(headers=self.headers, connector=aiohttp.TCPConnector(ssl=False)) as session:
async with session.get(self.menucost_url) as response:
html = await response.read()
soup = BeautifulSoup(html, "html.parser")
lis = soup.select("div.board > ul > li")
prices = {}
for li in lis:
spans = li.find_all("span")
prices[spans[0].text] = spans[1].text
return prices

async def run_30days(self):
date = datetime.datetime.now(timezone("Asia/Seoul")).date()
menucosts = await self.get_menucosts()
tasks = [self.run(date=date + datetime.timedelta(weeks=i), menucosts=menucosts) for i in range(4)]
return await asyncio.gather(*tasks, return_exceptions=True)

async def run(self, date=None, menucosts=None, **kwargs):
if not date:
date = datetime.datetime.now(timezone("Asia/Seoul")).date()
if not menucosts:
menucosts = await self.get_menucosts()
urllib3.disable_warnings()
async with aiohttp.ClientSession(headers=self.headers, connector=aiohttp.TCPConnector(ssl=False)) as session:
data = {
"action": "metapresso_dorm_food_week_list",
"start_week_date": date.isoformat(),
"target_blog": "39",
}
async with session.post(self.url, data=data) as response:
html = await response.read()
soup = BeautifulSoup(html, "html.parser")
self.crawl(soup, menucosts=menucosts, **kwargs)

def crawl(self, soup, menucosts=None, **kwargs):
if not menucosts:
menucosts = {}

trs = soup.select("table > tbody > tr")
ths = soup.select("table > thead > tr > th")
dates = [th.text for th in ths[-7:]]
type = ""
restaurant_detail = [[] for _ in range(len(trs))]

for row_idx, tr in enumerate(trs):
tds = tr.select("td")

for td in tds[:-7]:
rowspan = td.attrs.get("rowspan")
rowspan = int(rowspan[0]) if rowspan else 1
type_tmp = text_normalizer(td.text)
if type_tmp in Meal.type_handler:
type = type_tmp
else:
for i in range(rowspan):
restaurant_detail[row_idx + i].append(td.text)

for col_idx, td in enumerate(tds[-7:]):
ul = td.find("ul")
if ul:
for li in ul.find_all("li", recursive=False):
spans = li.find_all("span")
name = spans[-1].text
price = menucosts.get(spans[0].text)
restaurant = self.restaurant
meal = Meal(restaurant, name, dates[col_idx], type, price)
meal = self.normalize(
meal, restaurant_detail=restaurant_detail[row_idx], final_restaurants=["아워홈"]
)
self.found_meal(meal)
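
A hedged usage sketch for the new crawler, assuming crawlers.misc is an importable package and the script runs from the repository root. Note that the base class's abstract hook is renamed to run_7days in this same commit while SnudormRestaurantCrawler still defines run_30days, so direct instantiation may require reconciling that mismatch first:

import asyncio

from crawlers.misc.snudorm_crawler import SnudormRestaurantCrawler

async def main():
    crawler = SnudormRestaurantCrawler()
    # run_30days issues one POST per week for four weeks and gathers with
    # return_exceptions=True, so per-week failures are collected, not raised.
    await crawler.run_30days()
    for meal in crawler.meals:  # attribute names assumed from the Meal usage above
        print(meal.restaurant, meal.name, meal.price)

asyncio.run(main())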
67 changes: 25 additions & 42 deletions crawlers/snuco_crawler.py
@@ -7,47 +7,15 @@
from crawlers.base_crawler import (
FindParenthesisHash,
FindPrice,
FindRestaurantDetail,
Meal,
MealNormalizer,
RemoveInfoFromMealName,
RemoveMealNumber,
RestaurantCrawler,
text_normalizer,
)


class RemoveMealNumber(MealNormalizer):
def normalize(self, meal, **kwargs):
if "①" in meal.name or "②" in meal.name:
meal.set_name(meal.name.replace("①", ""))
meal.set_name(meal.name.replace("②", ""))
return meal


class RemoveInfoFromMealName(MealNormalizer):
info_sign = ["※", "►", "※", "브레이크 타임"]

def normalize(self, meal, **kwargs):
meal.set_name(re.sub("(" + "|".join(self.info_sign) + ").*", "", meal.name))
return meal


class FindRestaurantDetail(MealNormalizer):
restaurant_regex = [
r"(.*)\( ?(\d층.*)\)(.*)",
r"(.*)\((.*식당) ?\)(.*)",
r"(.*)< ?(\d층.*)>(.*)",
r"(.*)<(.*식당) ?>(.*)",
r"(.*)<(테이크아웃)>(.*)",
]

def normalize(self, meal, **kwargs):
for regex in self.restaurant_regex:
m = re.match(regex, meal.name)
if m:
meal.set_restaurant(meal.restaurant + ">" + m.group(2).strip())
meal.set_name(m.group(1).strip() + m.group(3).strip())
return meal


class SnucoRestaurantCrawler(RestaurantCrawler):
url = "https://snuco.snu.ac.kr/foodmenu/"
normalizer_classes = [
@@ -57,9 +25,21 @@ class SnucoRestaurantCrawler(RestaurantCrawler):
RemoveInfoFromMealName,
RemoveMealNumber,
]
next_line_str = ["봄", "소반", "콤비메뉴", "셀프코너", "채식뷔페", "추가코너", "돈까스비빔면셋트", "탄탄비빔면셋트"]
next_line_str = [
"봄",
"소반",
"콤비메뉴",
"셀프코너",
"채식뷔페",
"추가코너",
"돈까스비빔면셋트",
"탄탄비빔면셋트",
]
next_line_keyword = ["지역맛집따라잡기", "호구셋트"]  # keywords followed by one extra line
multi_line_keywords = {"+": ["셀프코너", "채식뷔페", "뷔페"], " / ": ["추가코너"]}  # keywords followed by several extra lines
multi_line_keywords = {
"+": ["셀프코너", "채식뷔페", "뷔페"],
" / ": ["추가코너"],
}  # keywords followed by several extra lines
multi_line_finisher = {
"셀프코너": "주문식메뉴"
}  # marker that a multi-line block has ended; by the logic the check runs only after the 주문식메뉴 row has already been appended, so that trailing row must be split off at the end
@@ -115,7 +95,10 @@ def get_multi_line_delimiter(self, meal):
if not meal:
return None
code = text_normalizer(meal.name, True)
for keyword, finisher in self.multi_line_finisher.items():  # if the finisher appears, treat the meal as having no delimiter
for (
keyword,
finisher,
) in self.multi_line_finisher.items():  # if the finisher appears, treat the meal as having no delimiter
if keyword in code and finisher in code:
return None
for delimiter, keywords in self.multi_line_keywords.items():
@@ -133,9 +116,9 @@ def combine(self, last_meal, meal, delimiter=": "):
last_meal.set_price(meal.price)
return last_meal

async def run_30days(self):
async def run_7days(self):
date = datetime.datetime.now(timezone("Asia/Seoul")).date()
tasks = [self.run(date=date + datetime.timedelta(days=i)) for i in range(30)]
tasks = [self.run(date=date + datetime.timedelta(days=i)) for i in range(7)]
return await asyncio.gather(*tasks, return_exceptions=True)

async def run(self, date=None, **kwargs):
@@ -148,8 +131,8 @@ def found_meal(self, meal):
if meal and self.is_meal_name_when_normalized(meal.name) and "교직" not in meal.name:
self.meals.append(meal)

def get_name_from_raw_restaurant(self, row_restaurant):
normalized = text_normalizer(row_restaurant)
def get_name_from_raw_restaurant(self, raw_restaurant):
normalized = text_normalizer(raw_restaurant)
phone_match = re.match(r".*\((\d+-\d+)\)", normalized)

if phone_match is None:
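
To make the keyword/finisher interaction concrete, here is a minimal stand-alone sketch of the get_multi_line_delimiter logic; the final keyword-matching loop is extrapolated from the truncated hunk above and is an assumption:

multi_line_keywords = {"+": ["셀프코너", "채식뷔페", "뷔페"], " / ": ["추가코너"]}
multi_line_finisher = {"셀프코너": "주문식메뉴"}

def get_multi_line_delimiter(code):
    # 'code' stands in for text_normalizer(meal.name, True).
    # A finisher cancels the delimiter: once 주문식메뉴 appears alongside
    # 셀프코너, the multi-line block is treated as already closed.
    for keyword, finisher in multi_line_finisher.items():
        if keyword in code and finisher in code:
            return None
    for delimiter, keywords in multi_line_keywords.items():
        if any(keyword in code for keyword in keywords):
            return delimiter
    return None

print(get_multi_line_delimiter("셀프코너 오늘의메뉴"))  # '+'
print(get_multi_line_delimiter("셀프코너 주문식메뉴"))  # None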