diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 59e8072..870bf19 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,18 +1,18 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 + rev: v4.5.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - repo: https://github.com/codespell-project/codespell - rev: v2.2.2 + rev: v2.2.6 hooks: - id: codespell - repo: https://github.com/PyCQA/autoflake - rev: v1.7.7 + rev: v2.2.1 hooks: - id: autoflake name: autoflake @@ -21,7 +21,7 @@ repos: files: \.py$ - repo: https://github.com/asottile/reorder_python_imports - rev: v3.9.0 + rev: v3.12.0 hooks: - id: reorder-python-imports args: @@ -43,20 +43,20 @@ repos: exclude: static/.* - repo: https://github.com/asottile/pyupgrade - rev: v3.2.0 + rev: v3.15.0 hooks: - id: pyupgrade args: - --py37-plus - repo: https://github.com/asottile/add-trailing-comma - rev: v2.3.0 + rev: v3.1.0 hooks: - id: add-trailing-comma args: - --py36-plus - repo: https://github.com/psf/black - rev: "22.10.0" + rev: "23.9.1" hooks: - id: black diff --git a/README.md b/README.md index 574d501..64f5898 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,8 @@ TRANSFER_POLICY = "if_not_present" # For PAN_BAIDU_BDUSS and PAN_BAIDU_COOKIES, please check the documentation of BaiduPCS-Py PAN_BAIDU_BDUSS = "" PAN_BAIDU_COOKIES = "" +# do not download file if path matches these regex +IGNORE_PATH_RE = ".*__MACOSX.*|.*spam.*" ## django settings # 0: production, 1: development diff --git a/baidupcsleecher/settings.py b/baidupcsleecher/settings.py index 8c7e954..f121bf2 100644 --- a/baidupcsleecher/settings.py +++ b/baidupcsleecher/settings.py @@ -67,6 +67,8 @@ TRANSFER_POLICY = getenv("TRANSFER_POLICY", "if_not_present") PAN_BAIDU_BDUSS = getenv("PAN_BAIDU_BDUSS", "") PAN_BAIDU_COOKIES = getenv("PAN_BAIDU_COOKIES", "") +# do not download these path +IGNORE_PATH_RE = getenv("IGNORE_PATH_RE", ".*__MACOSX.*|.*spam.*") 
REST_FRAMEWORK = { "DEFAULT_PAGINATION_CLASS": "drf_link_header_pagination.LinkHeaderPagination", diff --git a/task/baidupcs.py b/task/baidupcs.py index 4a2b9b7..69a6776 100644 --- a/task/baidupcs.py +++ b/task/baidupcs.py @@ -14,6 +14,7 @@ from .utils import cookies2dict from .utils import download_url +from .utils import match_regex from .utils import unify_shared_link logger = logging.getLogger("baibupcs") @@ -101,6 +102,9 @@ def download_dir(self, remote_dir, local_dir, sample_size=0): def download_file(self, remote_path, local_dir, file_size, sample_size=0): local_path = Path(local_dir) / basename(remote_path) logger.info(f" {remote_path} -> {local_path}") + if match_regex(str(remote_path), settings.IGNORE_PATH_RE): + logger.info(f" {remote_path} matched ignore paths, skipping") + return if not local_path.parent.exists(): local_path.parent.mkdir(parents=True) diff --git a/task/migrations/0001_initial.py b/task/migrations/0001_initial.py index 47f2c81..6e3d125 100644 --- a/task/migrations/0001_initial.py +++ b/task/migrations/0001_initial.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - initial = True dependencies = [] diff --git a/task/migrations/0002_task_captcha_task_captcha_code_task_captcha_required.py b/task/migrations/0002_task_captcha_task_captcha_code_task_captcha_required.py index 7f90819..80076ee 100644 --- a/task/migrations/0002_task_captcha_task_captcha_code_task_captcha_required.py +++ b/task/migrations/0002_task_captcha_task_captcha_code_task_captcha_required.py @@ -4,7 +4,6 @@ class Migration(migrations.Migration): - dependencies = [ ("task", "0001_initial"), ] diff --git a/task/migrations/0011_alter_task_shared_id_alter_task_shared_link_and_more.py b/task/migrations/0011_alter_task_shared_id_alter_task_shared_link_and_more.py index 55b4e46..8c02664 100644 --- a/task/migrations/0011_alter_task_shared_id_alter_task_shared_link_and_more.py +++ b/task/migrations/0011_alter_task_shared_id_alter_task_shared_link_and_more.py @@ -4,7 +4,6 @@ 
def match_regex(string: str, regex: str) -> bool:
    """
    Check whether a string matches a regular expression.

    The pattern is applied with ``re.match`` semantics: it must match
    starting at the first character of ``string`` but does not have to
    consume the whole string. Alternatives can be combined with ``|``.

    An empty ``regex`` is treated as "no pattern configured" and never
    matches. Without this guard the empty pattern would match every
    string, so a caller using the result as an ignore filter would
    ignore everything.

    Args:
        string (str): The input string.
        regex (str): The regular expression pattern.

    Returns:
        bool: True if ``regex`` is non-empty and matches at the start of
        ``string``, False otherwise.

    Raises:
        re.error: If ``regex`` is not a valid regular expression.

    Examples:
        >>> match_regex("hello.txt", ".*txt|.*mp3")
        True
        >>> match_regex("hello.html", ".*txt|.*mp3")
        False
        >>> match_regex("hello.txt", "")
        False
    """
    if not regex:
        # An empty pattern matches any input; treat it as "disabled".
        return False
    # re.match caches compiled patterns internally, so an explicit
    # re.compile on every call adds nothing.
    return re.match(regex, string) is not None