Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: ignore paths #87

Merged
merged 2 commits into from
Oct 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
rev: v4.5.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace

- repo: https://github.com/codespell-project/codespell
rev: v2.2.2
rev: v2.2.6
hooks:
- id: codespell

- repo: https://github.com/PyCQA/autoflake
rev: v1.7.7
rev: v2.2.1
hooks:
- id: autoflake
name: autoflake
Expand All @@ -21,7 +21,7 @@ repos:
files: \.py$

- repo: https://github.com/asottile/reorder_python_imports
rev: v3.9.0
rev: v3.12.0
hooks:
- id: reorder-python-imports
args:
Expand All @@ -43,20 +43,20 @@ repos:
exclude: static/.*

- repo: https://github.com/asottile/pyupgrade
rev: v3.2.0
rev: v3.15.0
hooks:
- id: pyupgrade
args:
- --py37-plus

- repo: https://github.com/asottile/add-trailing-comma
rev: v2.3.0
rev: v3.1.0
hooks:
- id: add-trailing-comma
args:
- --py36-plus

- repo: https://github.com/psf/black
rev: "22.10.0"
rev: "23.9.1"
hooks:
- id: black
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ TRANSFER_POLICY = "if_not_present"
# For PAN_BAIDU_BDUSS and PAN_BAIDU_COOKIES, please check the documentation of BaiduPCS-Py
PAN_BAIDU_BDUSS = ""
PAN_BAIDU_COOKIES = ""
# do not download files whose path matches this regex
IGNORE_PATH_RE = ".*__MACOSX.*|.*spam.*"

## django settings
# 0: production, 1: development
Expand Down
2 changes: 2 additions & 0 deletions baidupcsleecher/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@
TRANSFER_POLICY = getenv("TRANSFER_POLICY", "if_not_present")
PAN_BAIDU_BDUSS = getenv("PAN_BAIDU_BDUSS", "")
PAN_BAIDU_COOKIES = getenv("PAN_BAIDU_COOKIES", "")
# do not download files whose remote path matches this regex
IGNORE_PATH_RE = getenv("IGNORE_PATH_RE", ".*__MACOSX.*|.*spam.*")

REST_FRAMEWORK = {
"DEFAULT_PAGINATION_CLASS": "drf_link_header_pagination.LinkHeaderPagination",
Expand Down
4 changes: 4 additions & 0 deletions task/baidupcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from .utils import cookies2dict
from .utils import download_url
from .utils import match_regex
from .utils import unify_shared_link

logger = logging.getLogger("baibupcs")
Expand Down Expand Up @@ -101,6 +102,9 @@ def download_dir(self, remote_dir, local_dir, sample_size=0):
def download_file(self, remote_path, local_dir, file_size, sample_size=0):
local_path = Path(local_dir) / basename(remote_path)
logger.info(f" {remote_path} -> {local_path}")
if match_regex(str(remote_path), settings.IGNORE_PATH_RE):
logger.info(f" {remote_path} matched ignore paths, skipping")
return

if not local_path.parent.exists():
local_path.parent.mkdir(parents=True)
Expand Down
1 change: 0 additions & 1 deletion task/migrations/0001_initial.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@


class Migration(migrations.Migration):

initial = True

dependencies = []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@


class Migration(migrations.Migration):

dependencies = [
("task", "0001_initial"),
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@


class Migration(migrations.Migration):

dependencies = [
("task", "0010_alter_task_full_download_now"),
]
Expand Down
29 changes: 29 additions & 0 deletions task/tests/test_baidupcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def test_get_baidupcs_client(mock_cookies2dict, mock_settings, mock_BaiduPCS):
mock_settings.PAN_BAIDU_BDUSS = "test_bduss"
mock_settings.PAN_BAIDU_COOKIES = "test_cookies"
mock_cookies2dict.return_value = {"test_cookies"}
mock_settings.IGNORE_PATH_RE = ".*__MACOSX.*|.*spam.*"

get_baidupcs_client()

Expand Down Expand Up @@ -229,3 +230,31 @@ def test_download(self, mock_download, mock_list):
assert mock_download.called
assert mock_download.call_count == 1
assert mock_download.call_args.args[0].name == "text.txt"

# Verifies that download_dir skips entries whose path matches the ignore
# regex: list_files is patched to return one ordinary file and one file
# under "__MACOSX/", and only the ordinary file should reach download_url.
@patch(
"task.baidupcs.BaiduPCSClient.list_files",
return_value=[
# Ordinary file: expected to be downloaded.
{
"path": "file.txt",
"is_dir": False,
"is_file": True,
"size": 2,
"md5": "abcd",
},
# Path contains "__MACOSX": expected to be skipped by the
# IGNORE_PATH_RE check performed in download_file.
{
"path": "__MACOSX/text.txt",
"is_dir": False,
"is_file": True,
"size": 1024,
"md5": "badbeef",
},
],
)
# download_url is patched so no network access happens; it reports 100 bytes.
@patch("task.baidupcs.download_url", return_value=100)
def test_ignore_download(self, mock_download, mock_list):
with tempfile.TemporaryDirectory() as tmpdir:
self.client.download_dir("/", tmpdir, 100)

# Exactly one download call, and it targets the non-ignored file.
assert mock_download.called
assert mock_download.call_count == 1
assert mock_download.call_args.args[0].name == "file.txt"
21 changes: 21 additions & 0 deletions task/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,24 @@ def download_url(local_path, url, headers, limit=0):
if limit > 0 and total >= limit:
return total
return total


def match_regex(string: str, regex: str) -> bool:
    """
    Check whether a string matches a regular expression.

    Matching uses ``re.match`` semantics: the pattern is anchored at the
    start of ``string`` but not at the end, so a pattern has to begin with
    ``.*`` in order to match text that appears later in the string.

    An empty ``regex`` is treated as "no pattern configured" and never
    matches. Without this guard an empty pattern would match every string,
    so clearing a setting such as ``IGNORE_PATH_RE`` would ignore all paths
    instead of none.

    Args:
        string (str): The input string.
        regex (str): The regular expression pattern.

    Returns:
        bool: True if the string matches the regular expression, False
        otherwise (including when ``regex`` is empty).

    Raises:
        re.error: If ``regex`` is not a valid regular expression.

    Examples:
        >>> match_regex("hello.txt", ".*txt|.*mp3")
        True
        >>> match_regex("hello.html", ".*txt|.*mp3")
        False
        >>> match_regex("hello.txt", "")
        False
    """
    # An empty pattern matches everything; callers use "" to mean "disabled".
    if not regex:
        return False
    # re.match compiles and caches the pattern internally.
    return re.match(regex, string) is not None