Skip to content

Commit

Permalink
Merge pull request #70 from wafflestudio/dev
Browse files Browse the repository at this point in the history
2024 2월 정식 버전 배포
  • Loading branch information
GanziDaeyong authored Feb 26, 2024
2 parents 3fd6a89 + 9bce8aa commit 928730d
Show file tree
Hide file tree
Showing 6 changed files with 98 additions and 72 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ GitHub Flow + Issue based branch 방식을 사용합니다.
- GitHub Flow는 [여기](https://medium.com/@patrickporto/4-branching-workflows-for-git-30d0aaee7bf) 참고
- 개발이 필요한 사항은 우선 issue에 올리고, 해당 issue 번호로 branch를 만듭니다.
- 예시 브랜치) feat/14-crawling-debugging
- new PR -> dev 브랜치로 merge -> dev 브랜치가 테스트 통과하면 prod 브랜치로 merge
- 개별 브랜치에서 dev로 PR -> approve 받기 -> squash merge -> dev 정상 동작 확인 -> dev에서 prod로 PR -> merge commit -> prod 정상 동작 확인
- auto delete head branch 옵션을 꺼두었습니다. 개발이 완료된 개별 브랜치는 꼭 직접 삭제해주세요.

### Code Styles
- [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html)
Expand Down
27 changes: 20 additions & 7 deletions crawlers/base_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,19 @@ class Meal:
BR = "BR"
LU = "LU"
DN = "DN"
type_handler = {BR: BR, LU: LU, DN: DN, "아침": BR, "점심": LU, "저녁": DN, "중식": LU, "석식": DN}
type_handler = {
BR: BR,
LU: LU,
DN: DN,
"아침": BR,
"점심": LU,
"저녁": DN,
"중식": LU,
"석식": DN,
"breakfast": BR,
"lunch": LU,
"dinner": DN,
}

def __init__(self, restaurant="", name="", date=None, type="", price=None, etc=None):
self.set_restaurant(restaurant)
Expand Down Expand Up @@ -171,21 +183,22 @@ async def run(self, url=None, **kwargs):
soup = BeautifulSoup(html, "html.parser")
self.crawl(soup, **kwargs)
except Exception as e:
print(e)
print(f"Error in Run: {str(e)}")

def normalize(self, meal, **kwargs):
    """Pass ``meal`` through every configured normalizer, in order.

    Each entry of ``self.normalizer_classes`` is instantiated with no
    arguments, and its ``normalize(meal, **kwargs)`` result becomes the input
    of the next entry.

    Returns:
        The value produced by the last normalizer, or ``meal`` itself when
        ``self.normalizer_classes`` is empty.
    """
    current = meal
    for cls in self.normalizer_classes:
        normalizer = cls()
        current = normalizer.normalize(current, **kwargs)
    return current

def is_meal_name(self, name):
name = text_normalizer(name, True)
if not name:
def is_meal_name_when_normalized(self, name):
normalized_name = text_normalizer(name, True)
if not normalized_name or normalized_name == "메뉴":
return False
return name and all(re.match(".*" + p + ".*", name) is None for p in self.not_meal)
is_meal_name = all(re.match(".*" + p + ".*", normalized_name) is None for p in self.not_meal)
return is_meal_name

def found_meal(self, meal):
if meal and self.is_meal_name(meal.name):
if meal and self.is_meal_name_when_normalized(meal.name):
self.meals.append(meal)

@abstractmethod
Expand Down
46 changes: 26 additions & 20 deletions crawlers/snuco_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,17 @@ class SnucoRestaurantCrawler(RestaurantCrawler):

def __init__(self):
super().__init__()
self.not_meal += ["셋트메뉴", "단품메뉴", "사이드메뉴", "결제", "혼잡시간", r"말렌카케이크", "1조각홀케이크", "식사"]
self.not_meal += [
"셋트메뉴",
"단품메뉴",
"사이드메뉴",
"결제",
"혼잡시간",
r"말렌카케이크",
"1조각홀케이크",
"식사",
"사이드",
]

def is_next_line_keyword(self, meal):
if not meal:
Expand All @@ -84,7 +94,7 @@ def is_next_line_keyword(self, meal):
return any((str == code) for str in self.next_line_str) or any((str in code) for str in self.next_line_keyword)

def filter_menu_names(self, meal_names: list):
return [name for name in meal_names if self.is_meal_name(name)]
return [name for name in meal_names if self.is_meal_name_when_normalized(name)]

def filter_and_split_menu_names(self, meal_name: list):
names = []
Expand Down Expand Up @@ -133,29 +143,20 @@ async def run_30days(self):
async def run(self, date=None, **kwargs):
if not date:
date = datetime.datetime.now(timezone("Asia/Seoul")).date()
url = (
self.url
+ f"?field_menu_date_value_1%5Bvalue%5D%5Bdate%5D=&field_menu_date_value%5Bvalue%5D%5Bdate%5D={date.month}%2F{date.day}%2F{date.year}"
)
url = self.url + f"?date={date.year}-{date.month:02d}-{date.day:02d}"
await super().run(url, date=date, **kwargs)

def found_meal(self, meal):
if meal and self.is_meal_name(meal.name) and "교직" not in meal.name:
if meal and self.is_meal_name_when_normalized(meal.name) and "교직" not in meal.name:
self.meals.append(meal)

def crawl(self, soup, **kwargs):
date = kwargs.get("date", datetime.datetime.now(timezone("Asia/Seoul")).date())
table = soup.select_one("div.view-content > table")
table = soup.find("table", {"class": "menu-table"})
if not table:
return

ths = table.select("thead > tr > th")
trs = table.tbody.find_all("tr", recursive=False)

types = []
for th in ths[1:]:
types.append(th.text)

for tr in trs:
tds = tr.find_all("td", recursive=False)
row_restaurant = tds[0].text
Expand All @@ -164,7 +165,11 @@ def crawl(self, soup, **kwargs):
for except_restaurant_name in self.except_restaurant_name_list
):
continue

for col_idx, td in enumerate(tds[1:]):
# meal type이 더 이상 ths에 포함되지 않고 tds 내부로 이동.
meal_type = td["class"][0]

# td.text에서 식단을 한번에 가져오는 것으로 변경
names = td.text.split("\n")
restaurant = text_normalizer(row_restaurant)
Expand All @@ -177,10 +182,10 @@ def crawl(self, soup, **kwargs):
filtered_names = self.filter_menu_names(names)

for name in filtered_names:
meal = Meal(restaurant, name, date, types[col_idx])
meal = Meal(restaurant, name, date, meal_type)
meal = self.normalize(meal)
# is_meal_name에서 normalizer도 호출한다.
if self.is_meal_name(meal.name):

if self.is_meal_name_when_normalized(meal.name):
# ISSUE#54 220동 이름 오류 수정
# ex) ㅁ 바비든든( ~ ): 덮밥류 -> 바비든든: 덮밥류
if meal.restaurant == "220동식당":
Expand All @@ -191,16 +196,17 @@ def crawl(self, soup, **kwargs):

# 교직원 식당 이름 설정을 위한 로직
if (
meal.restaurant == "자하연식당"
(meal.restaurant == "자하연식당 3층" or "자하연식당" in meal.restaurant)
and last_meal
and ("교직" in last_meal.name or "교직" in last_meal.restaurant)
) or meal.restaurant in self.jaha_faculty_keyword:
meal.set_restaurant("자하연식당>3층교직메뉴")
meal.set_restaurant("자하연식당 3층")

# 다음 한줄만 추가하는 경우
if not next_line_merged and self.is_next_line_keyword(last_meal):
last_meal = self.combine(last_meal, meal)
next_line_merged = True

else:
delimiter = self.get_multi_line_delimiter(last_meal)
# delimiter에 해당하는 경우에는 여기 걸림
Expand All @@ -219,7 +225,7 @@ def crawl(self, soup, **kwargs):
next_line_merged = False
elif self.get_multi_line_delimiter(last_meal) is None:
if meal.restaurant != restaurant:
meal = Meal(row_restaurant, name, date, types[col_idx])
meal = Meal(row_restaurant, name, date, meal_type)
meal = self.normalize(meal)
restaurant = meal.restaurant
self.found_meal(last_meal)
Expand Down
4 changes: 2 additions & 2 deletions crawlers/vet_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@


class VetRestaurantCrawler(RestaurantCrawler):
url = "http://vet.snu.ac.kr/node/152"
url = "https://vet.snu.ac.kr/금주의-식단/"
restaurant = "수의대식당"

async def run_30days(self):
    """Await a single ``self.run()`` call and return the gathered results.

    Returns:
        A one-element list as produced by ``asyncio.gather``. Because
        ``return_exceptions=True`` is used, an exception raised inside
        ``run`` is returned as that element instead of propagating.
    """
    pending = [self.run()]
    outcome = await asyncio.gather(*pending, return_exceptions=True)
    return outcome

def crawl(self, soup, **kwargs):
soup.div.extract()
trs = soup.select("table > tbody > tr")
trs = soup.select("table > thead > tr")

types = [th.text for th in trs[0].find_all("th")[1:]]

Expand Down
52 changes: 12 additions & 40 deletions handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,17 @@
from itertools import compress
import asyncio
import argparse
from slack import send_slack_message
from crawlers.base_crawler import text_normalizer
from crawlers.vet_crawler import VetRestaurantCrawler
from crawlers.snudorm_crawler import SnudormRestaurantCrawler
from crawlers.snuco_crawler import SnucoRestaurantCrawler
from slack import (
send_new_restaurants_message,
send_deleted_menus_message,
send_new_menus_message,
send_edited_menus_message,
_send_slack_message,
)


def compare_restaurants(db_restaurants, crawled_meals):
Expand Down Expand Up @@ -71,15 +77,6 @@ def compare_menus(db_menus, crawled_meals, restaurants):
)


def send_new_restaurants_message(new_restaurants):
print(f"New restaurants: {repr(new_restaurants)}")
if new_restaurants:
slack_message = f"{len(new_restaurants)} new restaurants found: "
for restaurant in new_restaurants:
slack_message = slack_message + '"' + restaurant.get("name_kr") + '" '
send_slack_message(slack_message)


def restaurants_transaction(crawled_meals, cursor):
get_restaurants_query = """
SELECT code
Expand All @@ -97,30 +94,6 @@ def restaurants_transaction(crawled_meals, cursor):
print("Restaurants checked")


def send_deleted_menus_message(deleted_menus):
print(f"Menus deleted: {repr(deleted_menus)}")
if deleted_menus:
send_slack_message(f"{len(deleted_menus)} menus deleted: {repr(deleted_menus)}")


def send_new_menus_message(new_menus):
slack_message = f"{len(new_menus)} new menus found: "
for menu in new_menus:
name_kr = menu.get("name_kr")
if ":" in name_kr:
slack_message = slack_message + '*"' + menu.get("name_kr") + '"* '
else:
slack_message = slack_message + '"' + menu.get("name_kr") + '" '
send_slack_message(slack_message)
print(f"New menus found: {repr(new_menus)}")


def send_edited_menus_message(edited_menus):
print(f"Menus edited: {repr(edited_menus)}")
if edited_menus:
send_slack_message(f"{len(edited_menus)} menus edited: {repr(edited_menus)}")


def menus_transaction(crawled_meals, cursor):
get_restaurants_query = """
SELECT id, code
Expand All @@ -136,32 +109,31 @@ def menus_transaction(crawled_meals, cursor):
"""
cursor.execute(get_menus_query)
db_menus = cursor.fetchall()

new_menus, deleted_menus, edited_menus = compare_menus(db_menus, crawled_meals, restaurants)

send_deleted_menus_message(deleted_menus)
if deleted_menus:
deleted_menus_id = [str(menu.get("id")) for menu in deleted_menus]
delete_menus_query = f"""
DELETE FROM menu
WHERE id in ({','.join(deleted_menus_id)});
"""
cursor.execute(delete_menus_query)
send_deleted_menus_message(deleted_menus)

send_new_menus_message(new_menus)
insert_menus_query = """
INSERT INTO menu(restaurant_id, code, date, type, name_kr, price, etc)
VALUES (%(restaurant_id)s, %(code)s, %(date)s, %(type)s, %(name_kr)s, %(price)s, %(etc)s);
"""
cursor.executemany(insert_menus_query, new_menus)
send_new_menus_message(new_menus)

send_edited_menus_message(edited_menus)
edited_menus_query = """
UPDATE menu
SET price=%(price)s, etc=%(etc)s, name_kr=%(name_kr)s
WHERE id=%(id)s;
"""
cursor.executemany(edited_menus_query, edited_menus)
send_edited_menus_message(edited_menus)

print("Menus checked")

Expand Down Expand Up @@ -232,12 +204,12 @@ def crawl(event, context):
menus_transaction(crawled_meals, cursor)
siksha_db.commit()

send_slack_message("Crawling has been successfully done")
_send_slack_message("Crawling has been successfully done")
return "Crawling has been successfully done"
except Exception as e:
siksha_db.rollback()
print(e)
send_slack_message("Crawling has been failed")
_send_slack_message("Crawling has been failed")
return "Crawling has been failed"
finally:
cursor.close()
Expand Down
38 changes: 36 additions & 2 deletions slack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,41 @@
import os


def send_slack_message(msg):
body = {"channel": os.environ["SLACK_CHANNEL"], "text": msg}
def _send_slack_message(message: str):
    """Post ``message`` to Slack via the ``chat.postMessage`` Web API.

    The target channel and the bearer token are read from the
    ``SLACK_CHANNEL`` and ``SLACK_TOKEN`` environment variables; a missing
    variable raises ``KeyError``. The HTTP response is not inspected.

    Args:
        message: text to post.
    """
    channel = os.environ["SLACK_CHANNEL"]
    token = os.environ["SLACK_TOKEN"]
    requests.post(
        "https://slack.com/api/chat.postMessage",
        headers={"Authorization": f"Bearer {token}"},
        data={"channel": channel, "text": message},
        timeout=100,
    )


def send_deleted_menus_message(menus: list):
    """Report deleted menus to Slack and echo them to stdout.

    The Slack message is only posted when ``menus`` is non-empty, so an
    uneventful crawl does not produce a "0 menus deleted" notification.
    The stdout line is always printed.

    Args:
        menus: deleted menu records; each must support ``.get("name_kr")``.
    """
    if menus:
        message = f"{len(menus)} menus deleted: \n" + build_body_message(menus)
        _send_slack_message(message)
    print(f"Menus deleted: {menus!r}")


def send_new_menus_message(menus: list):
    """Report newly found menus to Slack and echo them to stdout.

    The Slack message is posted even when ``menus`` is empty, so every call
    yields a "N new menus found" notification.

    Args:
        menus: new menu records; each must support ``.get("name_kr")``.
    """
    message = f"{len(menus)} new menus found: \n" + build_body_message(menus)
    _send_slack_message(message)
    print(f"New menus found: {menus!r}")


def send_edited_menus_message(menus: list):
    """Report edited menus to Slack and echo them to stdout.

    The Slack message is only posted when ``menus`` is non-empty, so an
    uneventful crawl does not produce a "0 menus edited" notification.
    The stdout line is always printed.

    Args:
        menus: edited menu records; each must support ``.get("name_kr")``.
    """
    if menus:
        message = f"{len(menus)} menus edited: \n" + build_body_message(menus)
        _send_slack_message(message)
    print(f"Menus edited: {menus!r}")


def send_new_restaurants_message(restaurants: list):
    """Report newly found restaurants to Slack and echo them to stdout.

    The message text is always built, but it is only posted when
    ``restaurants`` is non-empty. The stdout line is always printed.

    Args:
        restaurants: restaurant records; each must support ``.get("name_kr")``.
    """
    header = f"{len(restaurants)} new restaurants found: \n"
    slack_message = header + build_body_message(restaurants)
    if restaurants:
        _send_slack_message(slack_message)
    print(f"New restaurants: {restaurants!r}")


def build_body_message(menus_or_restaurants: list):
    """Format the ``name_kr`` of each record as a quoted, comma-separated list.

    Every record contributes ``"<name_kr>", `` (including the trailing comma
    and space), and a newline is inserted after every fifth record.

    Args:
        menus_or_restaurants: records supporting ``.get("name_kr")``; a
            missing key is rendered as ``"None"``.

    Returns:
        The formatted string, or ``""`` for an empty input.
    """
    pieces = []
    for position, entry in enumerate(menus_or_restaurants, start=1):
        pieces.append(f'"{entry.get("name_kr")}", ')
        if position % 5 == 0:
            pieces.append("\n")
    return "".join(pieces)

0 comments on commit 928730d

Please sign in to comment.