From 9c0dc2757f1038123c61dc5d655065755393671a Mon Sep 17 00:00:00 2001 From: Xie Yanbo Date: Tue, 7 Nov 2023 10:45:05 +0800 Subject: [PATCH] feat: purge useless data files --- README.md | 11 ++++ baidupcsleecher/settings.py | 2 +- task/models.py | 18 ++++--- task/purge.py | 48 +++++++++++++++++ task/serializers.py | 4 ++ task/tests/conftest.py | 20 +++++++ task/tests/test_api.py | 71 +++++++++++++++++++++++++ task/utils.py | 102 ++++++++++++++++++++++++++++++++++++ task/views.py | 36 ++++++++++--- 9 files changed, 296 insertions(+), 16 deletions(-) create mode 100644 task/purge.py create mode 100644 task/tests/conftest.py diff --git a/README.md b/README.md index 2d744f1..574f310 100644 --- a/README.md +++ b/README.md @@ -173,6 +173,17 @@ $ curl -X POST localhost:8000/task/${task_id}/restart_downloading/ ``` This simply restarts the download process for samples and full files, but skips the steps of saving and retrieving the file list. +### purge files of deleted leecher tasks + +After a long run, a large number of files belonging to deleted leecher tasks will accumulate in the data directory. To remove the files that you no longer need, call the purge API: +```sh +$ curl -X POST localhost:8000/task/purge/ +``` +By default, purged files are moved to the trash folder `baidupcsleecher_trash` inside the data directory; you have to delete them from there manually. If you want to delete the files permanently instead, set the parameter `move_to_trash=false`: +```sh +$ curl -X POST -d "move_to_trash=false" localhost:8000/task/purge/ +``` + ## simple ui You can also directly use the browser to access the simple web interface that comes with the service, submit download tasks, and view the task list. 
def _trash_target(trash_dir: Path, name: str) -> Path:
    """Return a path for *name* inside *trash_dir* that does not exist yet."""
    target = trash_dir / name
    counter = 1
    # A directory with the same name may already have been purged earlier;
    # pick a numbered sibling instead of failing or nesting into it.
    while target.exists():
        target = trash_dir / f"{name}.{counter}"
        counter += 1
    return target


def purge(move_to_dir: Path = None) -> None:
    """Remove data directories that no longer belong to any task.

    Every directory directly below ``settings.DATA_DIR`` whose name consists
    of two or three dot-separated parts is considered a task directory. Those
    that are neither the ``data_path`` nor the ``sample_data_path`` of an
    existing ``Task`` are purged.

    Args:
        move_to_dir: Trash directory the purged directories are moved into
            (created on demand). When it is ``None`` (or otherwise falsy) the
            directories are deleted permanently instead.
    """
    keep_dirs = set()
    for task in Task.objects.all():
        keep_dirs.add(task.data_path)
        keep_dirs.add(task.sample_data_path)

    root = settings.DATA_DIR
    exist_dirs = set()
    if root.exists():
        for entry in root.iterdir():
            # Only names with 2 or 3 dot-separated parts are candidates, so
            # unrelated directories (e.g. a trash folder without dots in its
            # name) are never touched.
            if entry.is_dir() and len(entry.name.split(".")) in (2, 3):
                exist_dirs.add(entry)

    useless = exist_dirs - keep_dirs
    print(f"{len(useless)} directories to be deleted.")

    for path in sorted(useless):
        if not path.exists():
            print(f"{path} is not exists, skip deletion.")
            continue
        if move_to_dir:
            print(f"start move {path} to trash dir {move_to_dir} ...")
            move_to_dir.mkdir(parents=True, exist_ok=True)
            # shutil.move falls back to copy + delete when the trash dir is
            # on another filesystem, where a plain os.rename would raise.
            shutil.move(str(path), str(_trash_target(move_to_dir, path.name)))
            print(f"{path} moved to trash dir.")
        else:
            print(f"start delete {path} ...")
            shutil.rmtree(path)
            print(f"{path} deleted.")


def remove_tasks(keep_task_ids: List[int] = None) -> List[int]:
    """Delete all tasks except those listed in *keep_task_ids*.

    When *keep_task_ids* is empty or ``None`` nothing is deleted, so calling
    the function without arguments never wipes the whole table.

    Args:
        keep_task_ids: Primary keys of the tasks that must survive.

    Returns:
        The ids of all tasks that exist after the deletion.
    """
    if keep_task_ids:
        Task.objects.exclude(id__in=keep_task_ids).delete()
    # Materialise the queryset so the result matches the declared list type.
    return list(Task.objects.all().values_list("id", flat=True))
def touch_file(path: Path):
    """Create an empty file at *path*, creating missing parent directories.

    An already existing file at *path* is truncated to zero length.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # write_text opens, writes and closes the file, so no handle is leaked.
    path.write_text("")
"123abc.def.sample/张楚/孤独的人是可耻的.mp3", + "123abc.def.sample/张楚/蚂蚁蚂蚁.mp3", + "123abc.def/张楚/孤独的人是可耻的.mp3", + "123abc.def/张楚/蚂蚁蚂蚁.mp3", + ] + + response = self.client.post(reverse("task-purge")) + + assert response.json() == {"done": True} + assert sorted(list_files(settings.DATA_DIR)) == [ + "baidupcsleecher_trash/123abc.def.sample/张楚/孤独的人是可耻的.mp3", + "baidupcsleecher_trash/123abc.def.sample/张楚/蚂蚁蚂蚁.mp3", + "baidupcsleecher_trash/123abc.def/张楚/孤独的人是可耻的.mp3", + "baidupcsleecher_trash/123abc.def/张楚/蚂蚁蚂蚁.mp3", + ] + + def test_purge_all(self): + touch_task_files(self.task) + self.task.delete() + assert list_files(settings.DATA_DIR) != [] + + response = self.client.post( + reverse("task-purge"), + data={"move_to_trash": False}, + format="json", + ) + + assert response.json() == {"done": True} + assert list_files(settings.DATA_DIR) == [] + + def test_purge_nothing(self): + touch_task_files(self.task) + files = sorted(list_files(settings.DATA_DIR)) + assert files == [ + "123abc.def.sample/张楚/孤独的人是可耻的.mp3", + "123abc.def.sample/张楚/蚂蚁蚂蚁.mp3", + "123abc.def/张楚/孤独的人是可耻的.mp3", + "123abc.def/张楚/蚂蚁蚂蚁.mp3", + ] + + response = self.client.post(reverse("task-purge")) + + assert response.json() == {"done": True} + assert sorted(list_files(settings.DATA_DIR)) == files diff --git a/task/utils.py b/task/utils.py index 4a90543..36e41c1 100644 --- a/task/utils.py +++ b/task/utils.py @@ -1,7 +1,11 @@ import logging +import os import re import traceback from http.cookies import SimpleCookie +from pathlib import Path +from typing import List +from typing import Tuple from urllib.parse import parse_qs from urllib.parse import urlparse @@ -112,3 +116,101 @@ def match_regex(string: str, regex: str) -> bool: """ pattern = re.compile(regex) return bool(re.match(pattern, string)) + + +def walk_dir(path: Path) -> List[Tuple[Path, List[os.DirEntry]]]: + """ + Recursively walks through a directory and yields tuples containing + the current path and a list of directory entries. 
+ + Args: + path (Path): The path to the directory. + + Returns: + List[Tuple[Path, List[os.DirEntry]]]: A list of tuples containing + the current path and a list of directory entries. + + Examples: + >>> import tempfile + >>> with tempfile.TemporaryDirectory() as temp_dir: + ... test_dir = Path(temp_dir) / "test_dir" + ... test_dir.mkdir() + ... file1 = test_dir / "file1.txt" + ... file1.touch() + ... sub_dir = test_dir / "sub_dir" + ... sub_dir.mkdir() + ... file2 = sub_dir / "file2.txt" + ... file2.touch() + ... entries = list(walk_dir(test_dir)) + ... len(entries) + 2 + >>> entries[0][0] == test_dir + True + >>> sorted([i.name for i in entries[0][1]]) + ['file1.txt', 'sub_dir'] + >>> entries[1][0] == sub_dir + True + >>> sorted([i.name for i in entries[1][1]]) + ['file2.txt'] + """ + + paths = [path] + while paths: + path = paths.pop(0) + with os.scandir(path) as scandir_it: + entries = list(scandir_it) + yield path, entries + for entry in entries: + if entry.is_dir(): + paths.append(path._make_child_relpath(entry.name)) + + +def walk_files(path: Path) -> List[Path]: + """ + >>> import tempfile + >>> with tempfile.TemporaryDirectory() as temp_dir: + ... test_dir = Path(temp_dir) / "test_dir" + ... test_dir.mkdir() + ... file1 = test_dir / "file1.txt" + ... file1.touch() + ... sub_dir = test_dir / "sub_dir" + ... sub_dir.mkdir() + ... file2 = sub_dir / "file2.txt" + ... file2.touch() + ... files = list(walk_files(test_dir)) + ... len(files) + 2 + >>> [i.name for i in files] + ['file1.txt', 'file2.txt'] + """ + for root, entries in walk_dir(path): + for p in entries: + if not p.is_dir(): + yield root / p + + +def list_files(root: Path, without_root=True) -> List[str]: + """ + >>> import tempfile + >>> with tempfile.TemporaryDirectory() as temp_dir: + ... test_dir = Path(temp_dir) / "test_dir" + ... test_dir.mkdir() + ... file1 = test_dir / "file1.txt" + ... file1.touch() + ... sub_dir = test_dir / "sub_dir" + ... sub_dir.mkdir() + ... 
file2 = sub_dir / "file2.txt" + ... file2.touch() + ... files = list_files(test_dir) + ... len(files) + 2 + >>> files + ['file1.txt', 'sub_dir/file2.txt'] + """ + result = [] + for file_path in walk_files(root): + if without_root: + result.append(str(file_path.relative_to(root))) + else: + result.append(str(file_path)) + return result diff --git a/task/views.py b/task/views.py index 928a99a..3ae22ca 100644 --- a/task/views.py +++ b/task/views.py @@ -2,6 +2,7 @@ from io import BytesIO from baidupcs_py.baidupcs import BaiduPCSError +from django.conf import settings from django.http import HttpResponse from django_filters import rest_framework as filters from rest_framework import mixins @@ -13,9 +14,11 @@ from .baidupcs import get_baidupcs_client from .leecher import transfer from .models import Task +from .purge import purge from .serializers import CaptchaCodeSerializer from .serializers import FullDownloadNowSerializer from .serializers import OperationSerializer +from .serializers import PurgeSerializer from .serializers import TaskSerializer logger = logging.getLogger(__name__) @@ -143,11 +146,28 @@ def erase(self, request, pk=None): except BaiduPCSError: return Response({task_id: message}) - def get_serializer(self, *args, **kwargs): - if self.action == "captcha_code": - return CaptchaCodeSerializer(*args, **kwargs) - if self.action == "full_download_now": - return FullDownloadNowSerializer(*args, **kwargs) - if self.action in ["restart", "restart_downloading", "resume", "erase"]: - return OperationSerializer(*args, **kwargs) - return super().get_serializer(*args, **kwargs) + @action( + methods=["post"], + detail=False, + name="Clearing local files of deleted tasks", + ) + def purge(self, request): + serializer = self.get_serializer_class()(data=request.data) + serializer.is_valid(raise_exception=True) + if serializer.validated_data["move_to_trash"]: + purge(settings.DATA_DIR / "baidupcsleecher_trash") + else: + purge() + return Response({"done": True}) + + def 
get_serializer_class(self): + serializer_classes = { + "captcha_code": CaptchaCodeSerializer, + "full_download_now": FullDownloadNowSerializer, + "purge": PurgeSerializer, + "restart": OperationSerializer, + "restart_downloading": OperationSerializer, + "resume": OperationSerializer, + "erase": OperationSerializer, + } + return serializer_classes.get(self.action, self.serializer_class)