Fix: 자하연 2023-2 menu (#52)
* hotfix: convert db port type to int (#40)

* fix: dev db host name

* Refactor: delete serverless resources (#42)

* refactor: Remove serverless files

* doc: Edit deploy and test description

* feat: Add environment in workflow files

* fix: gitignore

* feat: initialize poetry (#44)

* Use poetry instead of requirements.txt

* Fix Dockerfile & Add .venv to ignore files

* Fix PATH env in Dockerfile

* Delete expose in Dockerfile

* Update pyproject.toml

* fix: pip root user action error (#45)

* fix: remove --mount from Dockerfile (#46)

* fix: Dockerfile syntax (#47)

* feat: update README.md & restrict Python version to 3.10 (#48)

* fix: use tilde requirements for python version

* Update README.md

* Update README.md

* Configure black & pylint

* Apply black

* Fix python setup in gh workflow

* Use poetry instead of pip in gh workflow

* Fix python version in gh workflow

* Fix pyproject.toml

* Update Makefile

* Fix: 자하연 menu errors (#51)

* refactor: Remove wildcard import

* fix: local db config

* feat: Add split logic for jaha restaurant menus

* fix: lint

---------

Co-authored-by: Jaewan Park <[email protected]>
GoGiants1 and xxnpark authored Oct 16, 2023
1 parent 43e91d0 commit e7e587e
Showing 19 changed files with 1,584 additions and 295 deletions.
35 changes: 35 additions & 0 deletions .dockerignore
@@ -0,0 +1,35 @@
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
.env
node_modules
package.json
package-lock.json

# Serverless directories
.serverless

# mac
.DS_Store
venv
.venv

# JetBrains
.idea

# Python
__pycache__/
3 changes: 2 additions & 1 deletion .github/workflows/ecr-dev.yml
@@ -13,6 +13,7 @@ jobs:
deploy:
name: deploy dev
runs-on: ubuntu-latest
environment: development

steps:
- name: Checkout
@@ -43,7 +44,7 @@
# push it to ECR so that it can
# be deployed to ECS.
docker build --platform linux/amd64 -t $ECR_REGISTRY/$ECR_REPOSITORY_K8S:$IMAGE_TAG . \
--build-arg DB_HOST=${{ secrets.DB_HOST }} \
--build-arg DB_HOST=${{ secrets.DB_HOST_DEV }} \
--build-arg DB_NAME=${{ secrets.DB_NAME_DEV }} \
--build-arg DB_USER=${{ secrets.DB_USER_DEV }} \
--build-arg DB_PASSWORD=${{ secrets.DB_PASSWORD_DEV }} \
1 change: 1 addition & 0 deletions .github/workflows/ecr-prod.yml
@@ -13,6 +13,7 @@ jobs:
deploy:
name: deploy prod
runs-on: ubuntu-latest
environment: production

steps:
- name: Checkout
23 changes: 23 additions & 0 deletions .github/workflows/lint.yml
@@ -0,0 +1,23 @@
name: Lint

on: [push]

jobs:
lint:
runs-on: ubuntu-latest
steps:
- name: checkout
uses: actions/checkout@v3
- name: install poetry
run: pipx install poetry
- name: use python
uses: actions/setup-python@v4
with:
python-version: "3.10"
cache: poetry
- name: install black, pylint
run: poetry install --only dev
- name: check if code linted
run: |
poetry run black --check $(git ls-files '*.py')
poetry run pylint --recursive=yes $(git ls-files '*.py') || true
9 changes: 2 additions & 7 deletions .gitignore
@@ -16,19 +16,14 @@ var/
.installed.cfg
*.egg
.env
node_modules
package.json
package-lock.json

# Serverless directories
.serverless

# mac
.DS_Store
venv
.venv

# JetBrains
.idea

# Python
__pycache__/
__pycache__/
1 change: 0 additions & 1 deletion .nvmrc

This file was deleted.

52 changes: 39 additions & 13 deletions Dockerfile
@@ -1,4 +1,29 @@
FROM python:3.10
# ----- python-base ----- #
FROM python:3.10.11-slim-bullseye AS python-base

ENV PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_NO_CACHE_DIR=off \
PIP_DISABLE_PIP_VERSION_CHECK=on \
PIP_DEFAULT_TIMEOUT=100 \
PIP_ROOT_USER_ACTION=ignore \
POETRY_VIRTUALENVS_IN_PROJECT=1 \
POETRY_VIRTUALENVS_CREATE=1 \
POETRY_NO_INTERACTION=1

# ----- builder-base ----- #
FROM python-base AS builder-base

RUN pip install --upgrade pip && pip install poetry==1.5.0

WORKDIR /app

COPY poetry.lock pyproject.toml ./

RUN poetry install --without dev

# ----- runtime ----- #
FROM python-base AS runtime

ARG DB_HOST
ARG DB_NAME
@@ -7,19 +7,32 @@ ARG DB_PASSWORD
ARG SLACK_TOKEN
ARG SLACK_CHANNEL

ENV DB_HOST $DB_HOST
ENV DB_PORT 3306
ENV DB_NAME $DB_NAME
ENV DB_USER $DB_USER
ENV DB_PASSWORD $DB_PASSWORD
ENV SLACK_TOKEN $SLACK_TOKEN
ENV SLACK_CHANNEL $SLACK_CHANNEL
ENV DB_HOST=$DB_HOST \
DB_NAME=$DB_NAME \
DB_USER=$DB_USER \
DB_PASSWORD=$DB_PASSWORD \
SLACK_TOKEN=$SLACK_TOKEN \
SLACK_CHANNEL=$SLACK_CHANNEL \
VENV_PATH="/app/.venv"

ENV PATH="$VENV_PATH/bin:$PATH"

COPY --from=builder-base $VENV_PATH $VENV_PATH

COPY . /siksha-crawler
WORKDIR /siksha-crawler
WORKDIR /app

COPY requirements.txt requirements.txt
RUN pip install --upgrade pip
RUN pip install -r requirements.txt
COPY . .

CMD ["python", "handler.py"]
9 changes: 9 additions & 0 deletions Makefile
@@ -0,0 +1,9 @@
.PHONY: default
default:
@echo "Specify the target"
@exit 1

.PHONY: lint
lint:
black --check .
pylint --recursive=yes .
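
With these targets in place, the CI lint checks can be reproduced locally. One way to do it, assuming the dev dependencies (black, pylint) are installed through Poetry, is:
```shell
# Install only the dev dependency group (black, pylint), then run the
# lint target inside the Poetry-managed virtualenv.
poetry install --only dev
poetry run make lint
```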
94 changes: 48 additions & 46 deletions README.md
@@ -1,58 +1,62 @@
# SIKSHA CRAWLER
The web crawler for [**식샤**](https://siksha.wafflestudio.com/) (Siksha), Seoul National University's cafeteria menu service.

## See Also
- [siksha-api](https://github.com/wafflestudio/siksha-api)

## Requirements
- Python 3.10
- Poetry 1.5.0

## Conventions

### Branch & PR Rules
We use GitHub Flow with issue-based branches.
- See [here](https://medium.com/@patrickporto/4-branching-workflows-for-git-30d0aaee7bf) for an overview of GitHub Flow.
- File an issue for any work item first, then create a branch named after the issue number.
  - Example branch: feat/14-crawling-debugging
- Open a PR -> merge into the dev branch -> once dev passes testing, merge into the prod branch

### Code Styles
- [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html)
- [Black](https://black.readthedocs.io/en/stable/)
```shell
# Check code styles using black and pylint
make lint
```
```shell
# Format all codes using black
black .
```

## Dev Guidelines

- We use GitHub Flow with issue-based branches.
  - See [here](https://medium.com/@patrickporto/4-branching-workflows-for-git-30d0aaee7bf) for an overview of GitHub Flow.
  - File an issue for any work item first, then create a branch named after the issue number.
    - Example branch: feat/14-crawling-debugging
  - Branches are deleted automatically after merging into master.
  - The serverless deploy must be done before merging into master.

- master must always be in a deployable, defect-free state.
  - **Finish work on a branch -> PR -> review complete -> sls deploy -> verify nothing broke -> merge**

- We use black as the Python formatter and pylint as the linter.

## Functionality

- Crawls restaurant and menu information on a fixed schedule (5 AM) and writes it to the RDS siksha DB
- Sends the crawl results to the #siksha-noti channel

## Deploy & Test ([reference](https://www.serverless.com/blog/serverless-python-packaging/))

1. Setup
   1. Install and run Docker
   1. Install the AWS CLI and configure credentials
   1. Install Node and npm ([the Node version must stay at 14 or below](https://github.com/serverless/serverless/issues/8794))
   1. sudo npm install -g serverless
   1. npm init -y
   1. npm install --save serverless-python-requirements
   1. export SLS_DEBUG=\*
   1. serverless config credentials --overwrite --provider aws --key <your access key> --secret <your secret key>
   1. Change the dockerizePip setting in serverless.yml to match your OS, following the comments there
1. Deploy
   - serverless deploy
1. Test
   - serverless invoke -f crawler --log

## Dependency Management
### Python Dependencies
Activate the virtual environment and install the required packages.
```shell
poetry shell
poetry install
```
If you change the package list in `pyproject.toml`, bring the `poetry.lock` file up to date with the command below.
```shell
poetry lock
```

- requirements.txt
### Functionality
Restaurant and menu information is crawled on a fixed schedule (5 AM; the dev environment runs only on Mondays) and written to the RDS siksha DB.
The crawl results are then posted to the Slack channels `#siksha-noti` (prod) and `#siksha-noti-staging` (dev).
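
A minimal sketch of what this notification step could look like, assuming the official `slack_sdk` package and the `SLACK_TOKEN`/`SLACK_CHANNEL` environment variables set in the Dockerfile; the actual reporting code in `handler.py` may differ:
```python
import os

from slack_sdk import WebClient


def notify_slack(summary: str) -> None:
    """Post a crawl-result summary to the channel configured via env vars."""
    client = WebClient(token=os.environ["SLACK_TOKEN"])
    client.chat_postMessage(channel=os.environ["SLACK_CHANNEL"], text=summary)


# Hypothetical example message; real figures would come from the crawl run.
notify_slack("Siksha crawl finished: menus upserted to the DB.")
```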

## Crawler Debugging
## Test

### Crawler Debugging
To check locally that the crawler works, run:
```
$ python3 handler.py --restaurant {restaurant-name (partial)} --date 20221014
python3 handler.py --restaurant {restaurant-name (partial)} --date 20221014
```

> The --restaurant (-r) argument is required <br>
> The --date (-d) argument is optional. Pass a date in YYYYMMDD form (e.g. 20221106) to print only that day's menu; omit it to print everything crawled.
> The `--restaurant` (`-r`) argument is required <br>
> The `--date` (`-d`) argument is optional. Pass a date in YYYYMMDD form (e.g. 20221106) to print only that day's menu; omit it to print everything crawled.
- Note: the crawling code is identical either way; the date merely filters the output. Overuse can put load on the source servers.
- Note: there is no exception handling, so invalid arguments may raise errors.

### Docker Build Test
To test that the image builds locally, run the following (see the [GitHub Workflow](.github/workflows/ecr-dev.yml)):
@@ -66,5 +70,3 @@ docker build -t {image-name} --build-arg {KEY}={VALUE}
1. An AWS Lambda detects image-tag changes in ECR and updates the image version in the waffle-world repo
1. A Kubernetes CronJob then runs the restaurant crawl on its configured schedule.

- 2023-08-15: DB migration (MariaDB -> MySQL)
34 changes: 8 additions & 26 deletions crawlers/base_crawler.py
@@ -1,4 +1,4 @@
from abc import *
from abc import ABCMeta, abstractmethod
import re
import datetime
from bs4 import BeautifulSoup
@@ -40,20 +40,9 @@ class Meal:
BR = "BR"
LU = "LU"
DN = "DN"
type_handler = {
BR: BR,
LU: LU,
DN: DN,
"아침": BR,
"점심": LU,
"저녁": DN,
"중식": LU,
"석식": DN,
}

def __init__(
self, restaurant="", name="", date=None, type="", price=None, etc=None
):
type_handler = {BR: BR, LU: LU, DN: DN, "아침": BR, "점심": LU, "저녁": DN, "중식": LU, "석식": DN}

def __init__(self, restaurant="", name="", date=None, type="", price=None, etc=None):
self.set_restaurant(restaurant)
self.set_name(name)
self.set_date(date)
@@ -134,9 +123,7 @@ def normalize(self, meal, **kwargs):


class RestaurantCrawler(metaclass=ABCMeta):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"
}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"}
url = ""
normalizer_classes = []
not_meal = [
@@ -163,16 +150,14 @@ def __init__(self):
self.meals = []

@abstractmethod
def run_30days(self):
async def run_30days(self):
pass

async def run(self, url=None, **kwargs):
urllib3.disable_warnings()
if url is None:
url = self.url
async with aiohttp.ClientSession(
headers=self.headers, connector=aiohttp.TCPConnector(ssl=False)
) as session:
async with aiohttp.ClientSession(headers=self.headers, connector=aiohttp.TCPConnector(ssl=False)) as session:
async with session.get(url) as response:
try:
html = await response.read()
Expand All @@ -181,7 +166,6 @@ async def run(self, url=None, **kwargs):
self.crawl(soup, **kwargs)
except Exception as e:
print(e)
pass

def normalize(self, meal, **kwargs):
for normalizer_cls in self.normalizer_classes:
@@ -192,9 +176,7 @@ def is_meal_name(self, name):
name = text_normalizer(name, True)
if not name:
return False
return name and all(
re.match(".*" + p + ".*", name) is None for p in self.not_meal
)
return name and all(re.match(".*" + p + ".*", name) is None for p in self.not_meal)

def found_meal(self, meal):
if meal and self.is_meal_name(meal.name):