fix index error (#71)

* fix index error
* fix: 더이상 ths내 type 존재 않음 (#72)
* Fix vet crawler
* 두레미담 주문식 메뉴 수정 & 자하연 식당 이름 수정

---------

Co-authored-by: iwin1203 <[email protected]>
wafflestudio · Feb 26, 2024 · 9bce8aa · 9bce8aa
1 parent 88d4626
commit 9bce8aa
Show file tree

Hide file tree

Showing 4 changed files with 38 additions and 21 deletions.
diff --git a/crawlers/base_crawler.py b/crawlers/base_crawler.py
@@ -42,7 +42,19 @@ class Meal:
     BR = "BR"
     LU = "LU"
     DN = "DN"
-    type_handler = {BR: BR, LU: LU, DN: DN, "아침": BR, "점심": LU, "저녁": DN, "중식": LU, "석식": DN}
+    type_handler = {
+        BR: BR,
+        LU: LU,
+        DN: DN,
+        "아침": BR,
+        "점심": LU,
+        "저녁": DN,
+        "중식": LU,
+        "석식": DN,
+        "breakfast": BR,
+        "lunch": LU,
+        "dinner": DN,
+    }
 
     def __init__(self, restaurant="", name="", date=None, type="", price=None, etc=None):
         self.set_restaurant(restaurant)
@@ -171,7 +183,7 @@ async def run(self, url=None, **kwargs):
                     soup = BeautifulSoup(html, "html.parser")
                     self.crawl(soup, **kwargs)
                 except Exception as e:
-                    print(e)
+                    print(f"Error in Run: {str(e)}")
 
     def normalize(self, meal, **kwargs):
         for normalizer_cls in self.normalizer_classes:
@@ -180,7 +192,7 @@ def normalize(self, meal, **kwargs):
 
     def is_meal_name_when_normalized(self, name):
         normalized_name = text_normalizer(name, True)
-        if not normalized_name:
+        if not normalized_name or normalized_name == "메뉴":
             return False
         is_meal_name = all(re.match(".*" + p + ".*", normalized_name) is None for p in self.not_meal)
         return is_meal_name

diff --git a/crawlers/snuco_crawler.py b/crawlers/snuco_crawler.py
@@ -75,7 +75,17 @@ class SnucoRestaurantCrawler(RestaurantCrawler):
 
     def __init__(self):
         super().__init__()
-        self.not_meal += ["셋트메뉴", "단품메뉴", "사이드메뉴", "결제", "혼잡시간", r"말렌카케이크", "1조각홀케이크", "식사", "메뉴", "사이드"]
+        self.not_meal += [
+            "셋트메뉴",
+            "단품메뉴",
+            "사이드메뉴",
+            "결제",
+            "혼잡시간",
+            r"말렌카케이크",
+            "1조각홀케이크",
+            "식사",
+            "사이드",
+        ]
 
     def is_next_line_keyword(self, meal):
         if not meal:
@@ -133,10 +143,7 @@ async def run_30days(self):
     async def run(self, date=None, **kwargs):
         if not date:
             date = datetime.datetime.now(timezone("Asia/Seoul")).date()
-        url = (
-            self.url
-            + f"?field_menu_date_value_1%5Bvalue%5D%5Bdate%5D=&field_menu_date_value%5Bvalue%5D%5Bdate%5D={date.month}%2F{date.day}%2F{date.year}"
-        )
+        url = self.url + f"?date={date.year}-{date.month:02d}-{date.day:02d}"
         await super().run(url, date=date, **kwargs)
 
     def found_meal(self, meal):
@@ -145,17 +152,11 @@ def found_meal(self, meal):
 
     def crawl(self, soup, **kwargs):
         date = kwargs.get("date", datetime.datetime.now(timezone("Asia/Seoul")).date())
-        table = soup.select_one("div.view-content > table")
+        table = soup.find("table", {"class": "menu-table"})
         if not table:
             return
-
-        ths = table.select("thead > tr > th")
         trs = table.tbody.find_all("tr", recursive=False)
 
-        types = []
-        for th in ths[1:]:
-            types.append(th.text)
-
         for tr in trs:
             tds = tr.find_all("td", recursive=False)
             row_restaurant = tds[0].text
@@ -164,7 +165,11 @@ def crawl(self, soup, **kwargs):
                 for except_restaurant_name in self.except_restaurant_name_list
             ):
                 continue
+
             for col_idx, td in enumerate(tds[1:]):
+                # meal type이 더 이상 ths에 포함되지 않고 tds 내부로 이동.
+                meal_type = td["class"][0]
+
                 # td.text에서 식단을 한번에 가져오는 것으로 변경
                 names = td.text.split("\n")
                 restaurant = text_normalizer(row_restaurant)
@@ -177,7 +182,7 @@ def crawl(self, soup, **kwargs):
                     filtered_names = self.filter_menu_names(names)
 
                 for name in filtered_names:
-                    meal = Meal(restaurant, name, date, types[col_idx])
+                    meal = Meal(restaurant, name, date, meal_type)
                     meal = self.normalize(meal)
 
                     if self.is_meal_name_when_normalized(meal.name):
@@ -191,16 +196,17 @@ def crawl(self, soup, **kwargs):
 
                         # 교직원 식당 이름 설정을 위한 로직
                         if (
-                            meal.restaurant == "자하연식당"
+                            (meal.restaurant == "자하연식당 3층" or "자하연식당" in meal.restaurant)
                             and last_meal
                             and ("교직" in last_meal.name or "교직" in last_meal.restaurant)
                         ) or meal.restaurant in self.jaha_faculty_keyword:
-                            meal.set_restaurant("자하연식당>3층교직메뉴")
+                            meal.set_restaurant("자하연식당 3층")
 
                         # 다음 한줄만 추가하는 경우
                         if not next_line_merged and self.is_next_line_keyword(last_meal):
                             last_meal = self.combine(last_meal, meal)
                             next_line_merged = True
+
                         else:
                             delimiter = self.get_multi_line_delimiter(last_meal)
                             # delimiter에 해당하는 경우에는 여기 걸림
@@ -219,7 +225,7 @@ def crawl(self, soup, **kwargs):
                             next_line_merged = False
                     elif self.get_multi_line_delimiter(last_meal) is None:
                         if meal.restaurant != restaurant:
-                            meal = Meal(row_restaurant, name, date, types[col_idx])
+                            meal = Meal(row_restaurant, name, date, meal_type)
                             meal = self.normalize(meal)
                             restaurant = meal.restaurant
                         self.found_meal(last_meal)

diff --git a/crawlers/vet_crawler.py b/crawlers/vet_crawler.py
@@ -11,7 +11,7 @@ async def run_30days(self):
 
     def crawl(self, soup, **kwargs):
         soup.div.extract()
-        trs = soup.select("table > tbody > tr")
+        trs = soup.select("table > thead > tr")
 
         types = [th.text for th in trs[0].find_all("th")[1:]]
 

diff --git a/handler.py b/handler.py
@@ -109,7 +109,6 @@ def menus_transaction(crawled_meals, cursor):
     """
     cursor.execute(get_menus_query)
     db_menus = cursor.fetchall()
-
     new_menus, deleted_menus, edited_menus = compare_menus(db_menus, crawled_meals, restaurants)
 
     if deleted_menus: