Skip to content

Commit

Permalink
fix index error (#71)
Browse files (browse the repository at this point in the history)
* fix index error

* fix: 더 이상 ths 내에 type이 존재하지 않음 (#72)

* Fix vet crawler

* 두레미담 주문식 메뉴 수정 & 자하연 식당 이름 수정

---------

Co-authored-by: iwin1203 <[email protected]>
  • Loading branch information
GoGiants1 and GanziDaeyong authored Feb 26, 2024
1 parent 88d4626 commit 9bce8aa
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 21 deletions.
18 changes: 15 additions & 3 deletions crawlers/base_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,19 @@ class Meal:
BR = "BR"
LU = "LU"
DN = "DN"
type_handler = {BR: BR, LU: LU, DN: DN, "아침": BR, "점심": LU, "저녁": DN, "중식": LU, "석식": DN}
type_handler = {
BR: BR,
LU: LU,
DN: DN,
"아침": BR,
"점심": LU,
"저녁": DN,
"중식": LU,
"석식": DN,
"breakfast": BR,
"lunch": LU,
"dinner": DN,
}

def __init__(self, restaurant="", name="", date=None, type="", price=None, etc=None):
self.set_restaurant(restaurant)
Expand Down Expand Up @@ -171,7 +183,7 @@ async def run(self, url=None, **kwargs):
soup = BeautifulSoup(html, "html.parser")
self.crawl(soup, **kwargs)
except Exception as e:
print(e)
print(f"Error in Run: {str(e)}")

def normalize(self, meal, **kwargs):
for normalizer_cls in self.normalizer_classes:
Expand All @@ -180,7 +192,7 @@ def normalize(self, meal, **kwargs):

def is_meal_name_when_normalized(self, name):
normalized_name = text_normalizer(name, True)
if not normalized_name:
if not normalized_name or normalized_name == "메뉴":
return False
is_meal_name = all(re.match(".*" + p + ".*", normalized_name) is None for p in self.not_meal)
return is_meal_name
Expand Down
38 changes: 22 additions & 16 deletions crawlers/snuco_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,17 @@ class SnucoRestaurantCrawler(RestaurantCrawler):

def __init__(self):
super().__init__()
self.not_meal += ["셋트메뉴", "단품메뉴", "사이드메뉴", "결제", "혼잡시간", r"말렌카케이크", "1조각홀케이크", "식사", "메뉴", "사이드"]
self.not_meal += [
"셋트메뉴",
"단품메뉴",
"사이드메뉴",
"결제",
"혼잡시간",
r"말렌카케이크",
"1조각홀케이크",
"식사",
"사이드",
]

def is_next_line_keyword(self, meal):
if not meal:
Expand Down Expand Up @@ -133,10 +143,7 @@ async def run_30days(self):
async def run(self, date=None, **kwargs):
if not date:
date = datetime.datetime.now(timezone("Asia/Seoul")).date()
url = (
self.url
+ f"?field_menu_date_value_1%5Bvalue%5D%5Bdate%5D=&field_menu_date_value%5Bvalue%5D%5Bdate%5D={date.month}%2F{date.day}%2F{date.year}"
)
url = self.url + f"?date={date.year}-{date.month:02d}-{date.day:02d}"
await super().run(url, date=date, **kwargs)

def found_meal(self, meal):
Expand All @@ -145,17 +152,11 @@ def found_meal(self, meal):

def crawl(self, soup, **kwargs):
date = kwargs.get("date", datetime.datetime.now(timezone("Asia/Seoul")).date())
table = soup.select_one("div.view-content > table")
table = soup.find("table", {"class": "menu-table"})
if not table:
return

ths = table.select("thead > tr > th")
trs = table.tbody.find_all("tr", recursive=False)

types = []
for th in ths[1:]:
types.append(th.text)

for tr in trs:
tds = tr.find_all("td", recursive=False)
row_restaurant = tds[0].text
Expand All @@ -164,7 +165,11 @@ def crawl(self, soup, **kwargs):
for except_restaurant_name in self.except_restaurant_name_list
):
continue

for col_idx, td in enumerate(tds[1:]):
# meal type이 더 이상 ths에 포함되지 않고 tds 내부로 이동.
meal_type = td["class"][0]

# td.text에서 식단을 한번에 가져오는 것으로 변경
names = td.text.split("\n")
restaurant = text_normalizer(row_restaurant)
Expand All @@ -177,7 +182,7 @@ def crawl(self, soup, **kwargs):
filtered_names = self.filter_menu_names(names)

for name in filtered_names:
meal = Meal(restaurant, name, date, types[col_idx])
meal = Meal(restaurant, name, date, meal_type)
meal = self.normalize(meal)

if self.is_meal_name_when_normalized(meal.name):
Expand All @@ -191,16 +196,17 @@ def crawl(self, soup, **kwargs):

# 교직원 식당 이름 설정을 위한 로직
if (
meal.restaurant == "자하연식당"
(meal.restaurant == "자하연식당 3층" or "자하연식당" in meal.restaurant)
and last_meal
and ("교직" in last_meal.name or "교직" in last_meal.restaurant)
) or meal.restaurant in self.jaha_faculty_keyword:
meal.set_restaurant("자하연식당>3층교직메뉴")
meal.set_restaurant("자하연식당 3층")

# 다음 한줄만 추가하는 경우
if not next_line_merged and self.is_next_line_keyword(last_meal):
last_meal = self.combine(last_meal, meal)
next_line_merged = True

else:
delimiter = self.get_multi_line_delimiter(last_meal)
# delimiter에 해당하는 경우에는 여기 걸림
Expand All @@ -219,7 +225,7 @@ def crawl(self, soup, **kwargs):
next_line_merged = False
elif self.get_multi_line_delimiter(last_meal) is None:
if meal.restaurant != restaurant:
meal = Meal(row_restaurant, name, date, types[col_idx])
meal = Meal(row_restaurant, name, date, meal_type)
meal = self.normalize(meal)
restaurant = meal.restaurant
self.found_meal(last_meal)
Expand Down
2 changes: 1 addition & 1 deletion crawlers/vet_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ async def run_30days(self):

def crawl(self, soup, **kwargs):
soup.div.extract()
trs = soup.select("table > tbody > tr")
trs = soup.select("table > thead > tr")

types = [th.text for th in trs[0].find_all("th")[1:]]

Expand Down
1 change: 0 additions & 1 deletion handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ def menus_transaction(crawled_meals, cursor):
"""
cursor.execute(get_menus_query)
db_menus = cursor.fetchall()

new_menus, deleted_menus, edited_menus = compare_menus(db_menus, crawled_meals, restaurants)

if deleted_menus:
Expand Down

0 comments on commit 9bce8aa

Please sign in to comment.