Commit
refactor: optimize code
NanmiCoder committed Jul 29, 2023
1 parent febbb13 · commit 4ff2cf8
Showing 17 changed files with 131 additions and 138 deletions.
6 changes: 4 additions & 2 deletions base/base_crawler.py
@@ -1,17 +1,19 @@
 from abc import ABC, abstractmethod
 
+from base.proxy_account_pool import AccountPool
+
 
 class AbstractCrawler(ABC):
     @abstractmethod
-    def init_config(self, **kwargs):
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool):
         pass
 
     @abstractmethod
     async def start(self):
         pass
 
     @abstractmethod
-    async def search_posts(self):
+    async def search(self):
         pass
 
 
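The interface change above replaces the untyped **kwargs hook with explicit, typed parameters, and renames search_posts to search. A minimal sketch of a concrete crawler against the new interface (illustrative only; it mirrors the DouYinCrawler changes further down in this commit):

    from base.base_crawler import AbstractCrawler
    from base.proxy_account_pool import AccountPool


    class ExampleCrawler(AbstractCrawler):
        platform: str
        login_type: str
        account_pool: AccountPool

        def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
            # Explicit assignments replace the old setattr-over-kwargs loop,
            # so type checkers can verify each field.
            self.platform = platform
            self.login_type = login_type
            self.account_pool = account_pool

        async def start(self) -> None:
            pass

        async def search(self) -> None:
            pass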
2 changes: 1 addition & 1 deletion base/proxy_account_pool.py
@@ -1,4 +1,4 @@
-from typing import Tuple, Optional, List, Set
+from typing import List, Optional, Set, Tuple
 
 import config
 
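This import reshuffle, and the similar ones in most files below, follow isort-style ordering: standard library first, then third-party, then local imports, with each group and each name list alphabetized. (That the commit was produced by isort is an assumption; no tooling config appears in this diff.)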
2 changes: 1 addition & 1 deletion config/__init__.py
@@ -1,3 +1,3 @@
-from .base_config import *
 from .account_config import *
+from .base_config import *
 from .db_config import *
4 changes: 2 additions & 2 deletions config/base_config.py
@@ -19,8 +19,8 @@
 # save user data dir
 USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
 
-# max page num
-MAX_PAGE_NUM = 20
+# crawler max notes count
+CRAWLER_MAX_NOTES_COUNT = 20
 
 # max concurrency num
 MAX_CONCURRENCY_NUM = 10
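The rename shifts the cap's unit from result pages to notes. The douyin crawler (core.py below) converts it back into pages using its fixed page size of 10. A small worked sketch of the new loop bound, assuming the default of 20:

    CRAWLER_MAX_NOTES_COUNT = 20
    dy_limite_count = 10  # douyin fixed limit page 10

    page = 0
    while (page + 1) * dy_limite_count <= CRAWLER_MAX_NOTES_COUNT:
        print(f"fetch offset={page * dy_limite_count}")  # prints offset=0, then offset=10
        page += 1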
4 changes: 1 addition & 3 deletions db.py
@@ -1,8 +1,6 @@
-from tortoise import Tortoise
-from tortoise import run_async
+from tortoise import Tortoise, run_async
 
 from config.db_config import *
-
 from tools import utils
 
 
22 changes: 13 additions & 9 deletions main.py
@@ -1,9 +1,9 @@
-import sys
-import asyncio
 import argparse
+import asyncio
+import sys
 
-import db
 import config
+import db
 from base import proxy_account_pool
 from media_platform.douyin import DouYinCrawler
 from media_platform.xhs import XiaoHongShuCrawler
@@ -17,14 +17,16 @@ def create_crawler(platform: str):
     elif platform == "dy":
         return DouYinCrawler()
     else:
-        raise ValueError("Invalid Media Platform Currently only supported xhs or douyin ...")
+        raise ValueError("Invalid Media Platform Currently only supported xhs or dy ...")
 
 
 async def main():
     # define command line params ...
     parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.PLATFORM)
-    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.LOGIN_TYPE)
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)', choices=["xhs", "dy"],
+                        default=config.PLATFORM)
+    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
+                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
 
     # init account pool
     account_pool = proxy_account_pool.create_account_pool()
@@ -34,16 +36,18 @@ async def main():
         await db.init_db()
 
     args = parser.parse_args()
-    crawler = CrawlerFactory().create_crawler(platform=args.platform)
+    crawler = CrawlerFactory.create_crawler(platform=args.platform)
     crawler.init_config(
-        command_args=args,
+        platform=args.platform,
+        login_type=args.lt,
         account_pool=account_pool
     )
     await crawler.start()
 
 
 if __name__ == '__main__':
     try:
-        asyncio.run(main())
+        # asyncio.run(main())
+        asyncio.get_event_loop().run_until_complete(main())
     except KeyboardInterrupt:
         sys.exit()
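Three behavioral changes land in main.py: argparse now rejects invalid --platform and --lt values via choices, the factory is called without instantiation (suggesting create_crawler became a static or class method; the decorator sits outside this hunk), and the entry point drives the existing event loop instead of letting asyncio.run() create and tear down a fresh one. A minimal sketch of the new argument validation, with hypothetical defaults standing in for config.PLATFORM and config.LOGIN_TYPE:

    import argparse

    parser = argparse.ArgumentParser(description='Media crawler program.')
    parser.add_argument('--platform', type=str, choices=["xhs", "dy"], default="xhs")
    parser.add_argument('--lt', type=str, choices=["qrcode", "phone", "cookie"], default="qrcode")

    args = parser.parse_args(["--platform", "dy"])
    print(args.platform)  # dy
    # parser.parse_args(["--platform", "weibo"]) now exits with an error like:
    # error: argument --platform: invalid choice: 'weibo' (choose from 'xhs', 'dy')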
16 changes: 8 additions & 8 deletions media_platform/douyin/client.py
@@ -1,17 +1,17 @@
-import copy
 import asyncio
-from typing import Optional, Dict, Callable
+import copy
+import urllib.parse
+from typing import Callable, Dict, Optional
 
-import httpx
 import execjs
-import urllib.parse
-from playwright.async_api import Page
-from playwright.async_api import BrowserContext
+import httpx
+from playwright.async_api import BrowserContext, Page
 
-from .field import *
-from .exception import *
 from tools import utils
+
+from .exception import *
+from .field import *
 
 
 class DOUYINClient:
     def __init__(
60 changes: 30 additions & 30 deletions media_platform/douyin/core.py
@@ -1,38 +1,38 @@
-import os
 import asyncio
+import os
 from asyncio import Task
-from argparse import Namespace
-from typing import Optional, List, Dict, Tuple
+from typing import Dict, List, Optional, Tuple
 
-from playwright.async_api import async_playwright
-from playwright.async_api import BrowserType
-from playwright.async_api import BrowserContext
-from playwright.async_api import Page
+from playwright.async_api import (BrowserContext, BrowserType, Page,
+                                  async_playwright)
 
 import config
+from base.base_crawler import AbstractCrawler
+from base.proxy_account_pool import AccountPool
+from models import douyin
 from tools import utils
 
 from .client import DOUYINClient
 from .exception import DataFetchError
 from .login import DouYinLogin
-from base.base_crawler import AbstractCrawler
-from base.proxy_account_pool import AccountPool
-from models import douyin
 
 
 class DouYinCrawler(AbstractCrawler):
+    platform: str
+    login_type: str
+    context_page: Page
+    dy_client: DOUYINClient
+    account_pool: AccountPool
+    browser_context: BrowserContext
 
     def __init__(self) -> None:
-        self.browser_context: Optional[BrowserContext] = None  # type: ignore
-        self.context_page: Optional[Page] = None  # type: ignore
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
         self.index_url = "https://www.douyin.com"
-        self.command_args: Optional[Namespace] = None  # type: ignore
-        self.account_pool: Optional[AccountPool] = None  # type: ignore
 
-    def init_config(self, **kwargs):
-        for key, value in kwargs.items():
-            setattr(self, key, value)
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
+        self.platform = platform
+        self.login_type = login_type
+        self.account_pool = account_pool
 
     async def start(self) -> None:
         account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
@@ -53,7 +53,7 @@ async def start(self) -> None:
         self.dy_client = await self.create_douyin_client(httpx_proxy)
         if not await self.dy_client.ping(browser_context=self.browser_context):
             login_obj = DouYinLogin(
-                login_type=self.command_args.lt,  # type: ignore
+                login_type=self.login_type,
                 login_phone=account_phone,
                 browser_context=self.browser_context,
                 context_page=self.context_page,
@@ -63,25 +63,25 @@ async def start(self) -> None:
             await self.dy_client.update_cookies(browser_context=self.browser_context)
 
         # search_posts
-        await self.search_posts()
+        await self.search()
 
         utils.logger.info("Douyin Crawler finished ...")
 
-    async def search_posts(self) -> None:
+    async def search(self) -> None:
         utils.logger.info("Begin search douyin keywords")
         for keyword in config.KEYWORDS.split(","):
             utils.logger.info(f"Current keyword: {keyword}")
             aweme_list: List[str] = []
-            max_note_len = config.MAX_PAGE_NUM
+            dy_limite_count = 10  # douyin fixed limit page 10
             page = 0
-            while max_note_len > 0:
+            while (page + 1) * dy_limite_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 try:
-                    posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword, offset=page * 10)
+                    posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
+                                                                            offset=page * dy_limite_count)
                 except DataFetchError:
                     utils.logger.error(f"search douyin keyword: {keyword} failed")
                     break
                 page += 1
-                max_note_len -= 10
                 for post_item in posts_res.get("data"):
                     try:
                         aweme_info: Dict = post_item.get("aweme_info") or \
@@ -93,15 +93,15 @@ async def search_posts(self) -> None:
             utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)
 
-    async def batch_get_note_comments(self, aweme_list: List[str]):
+    async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
         task_list: List[Task] = []
-        _semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         for aweme_id in aweme_list:
-            task = asyncio.create_task(self.get_comments(aweme_id, _semaphore), name=aweme_id)
+            task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
             task_list.append(task)
         await asyncio.wait(task_list)
 
-    async def get_comments(self, aweme_id: str, semaphore: "asyncio.Semaphore"):
+    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
         async with semaphore:
             try:
                 await self.dy_client.get_aweme_all_comments(
@@ -155,7 +155,7 @@ async def launch_browser(
         """Launch browser and create browser context"""
         if config.SAVE_LOGIN_STATE:
             user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                         config.USER_DATA_DIR % self.command_args.platform)  # type: ignore
+                                         config.USER_DATA_DIR % self.platform)  # type: ignore
             browser_context = await chromium.launch_persistent_context(
                 user_data_dir=user_data_dir,
                 accept_downloads=True,
@@ -173,7 +173,7 @@ async def launch_browser(
             )
             return browser_context
 
-    async def close(self):
+    async def close(self) -> None:
         """Close browser context"""
         await self.browser_context.close()
         utils.logger.info("Browser context closed ...")
17 changes: 6 additions & 11 deletions media_platform/douyin/login.py
@@ -1,22 +1,17 @@
-import sys
 import asyncio
 import functools
+import sys
 from typing import Optional
 
 import aioredis
-from tenacity import (
-    retry,
-    stop_after_attempt,
-    wait_fixed,
-    retry_if_result,
-    RetryError
-)
-from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError
-from playwright.async_api import BrowserContext
+from playwright.async_api import BrowserContext, Page
+from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
+                      wait_fixed)
 
 import config
-from tools import utils
 from base.base_crawler import AbstractLogin
+from tools import utils
 
 
 class DouYinLogin(AbstractLogin):
14 changes: 7 additions & 7 deletions media_platform/xhs/client.py
@@ -1,16 +1,16 @@
-import json
 import asyncio
-from typing import Optional, Dict
+import json
+from typing import Dict, Optional
 
 import httpx
-from playwright.async_api import Page
-from playwright.async_api import BrowserContext
+from playwright.async_api import BrowserContext, Page
 
-from .help import sign, get_search_id
-from .field import SearchSortType, SearchNoteType
-from .exception import DataFetchError, IPBlockError
 from tools import utils
+
+from .exception import DataFetchError, IPBlockError
+from .field import SearchNoteType, SearchSortType
+from .help import get_search_id, sign
 
 
 class XHSClient:
     def __init__(