Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: purge useless data files #90

Merged
merged 1 commit into from
Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,17 @@ $ curl -X POST localhost:8000/task/${task_id}/restart_downloading/
```
This simply restarts the download process for samples and full files, but skips the steps of saving and retrieving the file list.

### purge files of deleted leecher tasks

After the service has been running for a long time, files belonging to deleted leecher tasks can pile up on disk. To remove the files you no longer need, call the purge API:
```sh
$ curl -X POST localhost:8000/task/purge/
```
By default, purged files are moved to the trash folder `baidupcsleecher_trash`, and you have to delete them from there manually. To delete the files permanently instead, set the parameter `move_to_trash=false`:
```sh
$ curl -X POST -d "move_to_trash=false" localhost:8000/task/purge/
```

## simple ui
You can also directly use the browser to access the simple web interface that comes with the service, submit download tasks, and view the task list.

Expand Down
2 changes: 1 addition & 1 deletion baidupcsleecher/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
INSTALLED_APPS.append("django_browser_reload")

# baidupcsleecher settings
DATA_DIR = Path(getenv("DATA_DIR", "/tmp")).resolve()
# Local base directory for task data; each task's data directory is created
# beneath it. Override with the DATA_DIR environment variable.
DATA_DIR = Path(getenv("DATA_DIR", "/tmp/baidupcsleecher")).resolve()
# Base directory on the remote side; each task's remote path is built beneath it.
# NOTE(review): resolve() is evaluated against the local filesystem — confirm
# that is intended for a path that lives on the remote storage.
REMOTE_LEECHER_DIR = str(Path(getenv("REMOTE_LEECHER_DIR", "/leecher")).resolve())
# Presumably the pause between runner iterations, in seconds — confirm against the runner.
RUNNER_SLEEP_SECONDS = int(getenv("RUNNER_SLEEP_SECONDS", "5"))
# Presumably the size (in bytes) of a downloaded sample — TODO confirm.
SAMPLE_SIZE = int(getenv("SAMPLE_SIZE", "10240"))
Expand Down
18 changes: 11 additions & 7 deletions task/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,30 +98,34 @@ class Meta:
def __repr__(self):
    # Debug representation: task id, share id and the number of files.
    return f"<Task id={self.id}, {self.shared_id} with {self.total_files} files>"

def __str__(self):
def __str__(self) -> str:
    # The human-readable form is intentionally the same as repr().
    return repr(self)

@property
def path(self):
def path(self) -> str:
    """Return the task's directory name: ``<shared_id>.<shared_password>``."""
    return f"{self.shared_id}.{self.shared_password}"

@property
def sample_path(self):
def sample_path(self) -> str:
    """Return the directory name used for samples: ``<path>.sample``."""
    return f"{self.path}.sample"

@property
def data_path(self):
def data_path(self) -> Path:
    """Return the task's local data directory under ``settings.DATA_DIR``."""
    return settings.DATA_DIR / self.path

def ensure_data_path(self):
@property
def sample_data_path(self) -> Path:
    """Return the task's local sample directory under ``settings.DATA_DIR``."""
    return settings.DATA_DIR / self.sample_path

def ensure_data_path(self) -> None:
    """Create the task's local data directory, including missing parents.

    Does nothing when the directory already exists.

    Raises:
        FileExistsError: if ``data_path`` exists but is not a directory.
    """
    # The previous call passed ``exists_ok=True`` (a misspelling of
    # ``exist_ok``), which raises TypeError whenever the directory is
    # missing.  ``Path.mkdir`` performs the check-and-create in one step.
    self.data_path.mkdir(parents=True, exist_ok=True)

@property
def remote_path(self) -> str:
    """Return the task's remote directory: ``settings.REMOTE_LEECHER_DIR`` joined with ``path``."""
    return str(Path(settings.REMOTE_LEECHER_DIR) / self.path)

def set_files(self, files):
def set_files(self, files) -> None:
remote_base_dir = str(Path(settings.REMOTE_LEECHER_DIR) / self.path)
file_list = []
for file in files:
Expand Down
48 changes: 48 additions & 0 deletions task/purge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os
import shutil
from pathlib import Path
from typing import List

from django.conf import settings

from .models import Task


def purge(move_to_dir: Path = None) -> None:
    """Remove data directories that no longer belong to any task.

    Every direct sub-directory of ``settings.DATA_DIR`` whose name consists
    of two or three dot-separated parts is considered a task directory.
    Those that match neither the data path nor the sample data path of an
    existing ``Task`` are purged.

    Args:
        move_to_dir: When given, purged directories are moved into this
            directory instead of being deleted. When ``None`` (or otherwise
            falsy) they are deleted recursively.

    Raises:
        FileExistsError: if a directory with the same name is already
            present in ``move_to_dir``.
    """
    # Directories still referenced by a task must survive the purge.
    keep_dirs = set()
    for task in Task.objects.all():
        keep_dirs.update((task.data_path, task.sample_data_path))

    root = settings.DATA_DIR
    exist_dirs = set()
    if root.exists():
        for entry in root.iterdir():
            # Only names shaped like "<id>.<pwd>" or "<id>.<pwd>.sample"
            # (2 or 3 dot-separated parts) are candidates.
            if entry.is_dir() and len(entry.name.split(".")) in (2, 3):
                exist_dirs.add(entry)

    useless = exist_dirs - keep_dirs
    print(f"{len(useless)} directories to be deleted.")

    for directory in sorted(useless):
        if not directory.exists():
            print(f"{directory} is not exists, skip deletion.")
            continue
        if move_to_dir:
            print(f"start move {directory} to trash dir {move_to_dir} ...")
            target = move_to_dir / directory.name
            target.parent.mkdir(parents=True, exist_ok=True)
            # shutil.move would silently nest the source inside an existing
            # destination directory, so refuse the collision explicitly.
            if target.exists():
                raise FileExistsError(f"{target} already exists in trash dir.")
            # Unlike os.rename, shutil.move also works when the trash dir
            # lives on a different filesystem.
            shutil.move(str(directory), str(target))
            print(f"{directory} moved to trash dir.")
        else:
            print(f"start delete {directory} ...")
            shutil.rmtree(directory)
            print(f"{directory} deleted.")


def remove_tasks(keep_task_ids: List[int] = None) -> List[int]:
    """Delete every task whose id is not listed in ``keep_task_ids``.

    Args:
        keep_task_ids: Ids of the tasks to keep. When empty or ``None``
            nothing is deleted.

    Returns:
        The ids of all tasks that exist after the operation.
    """
    # ``None`` replaces the former mutable ``[]`` default; both are falsy,
    # so callers that omit the argument see the same behavior.
    if keep_task_ids:
        Task.objects.exclude(id__in=keep_task_ids).delete()
    # Materialize the queryset so the result matches the annotated type.
    return list(Task.objects.all().values_list("id", flat=True))
4 changes: 4 additions & 0 deletions task/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,9 @@ class FullDownloadNowSerializer(serializers.Serializer):
full_download_now = serializers.BooleanField()


class PurgeSerializer(serializers.Serializer):
    """Input parameters for the purge operation."""

    # When true (the default) purged files are moved to the trash folder
    # instead of being deleted outright.
    move_to_trash = serializers.BooleanField(default=True)


class OperationSerializer(serializers.Serializer):
    """Serializer that declares no fields."""

    # NOTE(review): presumably used by endpoints that take no input — confirm
    # against the views.
    pass
20 changes: 20 additions & 0 deletions task/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from pathlib import Path

import pytest
from django.conf import settings

from ..utils import walk_files


@pytest.fixture(autouse=True)
def data_dir_setup(tmp_path: Path):
    """Point ``settings.DATA_DIR`` at a per-test temporary directory.

    After the test the original value is restored and the files left in the
    temporary directory are printed to help with debugging.
    """
    original_data_dir = settings.DATA_DIR
    settings.DATA_DIR = tmp_path

    yield

    # Restore the global setting so the override cannot leak past this test.
    settings.DATA_DIR = original_data_dir

    print(f"files in tmp_path {tmp_path}:")
    total = 0
    for path in walk_files(tmp_path):
        print(f"  {path}")
        total += 1
    print(f"total: {total} files")
71 changes: 71 additions & 0 deletions task/tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,36 @@
from pathlib import Path
from unittest.mock import MagicMock
from unittest.mock import Mock
from unittest.mock import patch

from django.conf import settings
from django.urls import reverse
from rest_framework import status
from rest_framework.test import APITestCase

from ..models import Task
from ..serializers import TaskSerializer
from ..utils import list_files


def touch_file(path: Path):
    """Create an empty file at *path*, creating parent directories as needed.

    An existing file at *path* is truncated to zero length.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # write_text opens and closes the file itself, so no handle is left
    # dangling as with a bare ``open(...).write(...)``.
    path.write_text("")


def touch_task_files(task: Task):
    """Create an empty placeholder for every file entry of *task*.

    Each entry flagged ``is_file`` gets one empty file under the task's data
    directory and one under its sample data directory.
    """
    file_entries = (item for item in task.load_files() if item["is_file"])
    for item in file_entries:
        relative = item["path"]
        for base_dir in (task.data_path, task.sample_data_path):
            touch_file(path=base_dir / relative)


class TaskViewSetTestCase(APITestCase):
def setUp(self):
self.task = Task.objects.create(
shared_link="https://pan.baidu.com/s/123abc?pwd=def",
shared_id="123abc",
shared_password="def",
)
self.remote_files = [
{
Expand Down Expand Up @@ -247,9 +264,13 @@ def test_delete_remote_files(self, mock_get_baidupcs_client):

def test_delete_local_files(self):
    # Deleting a task's local files must leave the data directory empty.
    task_id = self.task.id
    marker = self.task.data_path / "test.txt"
    touch_file(path=marker)
    assert list_files(settings.DATA_DIR) != []

    url = reverse("task-local-files", args=[task_id])
    response = self.client.delete(url)

    expected = {str(task_id): "local files deleted"}
    assert response.json() == expected
    assert list_files(settings.DATA_DIR) == []

@patch("task.views.get_baidupcs_client")
def test_erase(self, mock_get_baidupcs_client):
Expand All @@ -259,3 +280,53 @@ def test_erase(self, mock_get_baidupcs_client):
response = self.client.delete(reverse("task-erase", args=[id]))

assert response.json() == {str(id): "task deleted"}
assert len(Task.objects.filter(pk=id)) == 0

def test_purge(self):
    # By default the files of a deleted task end up in the trash folder.
    touch_task_files(self.task)
    self.task.delete()
    before_purge = sorted(list_files(settings.DATA_DIR))
    assert before_purge == [
        "123abc.def.sample/张楚/孤独的人是可耻的.mp3",
        "123abc.def.sample/张楚/蚂蚁蚂蚁.mp3",
        "123abc.def/张楚/孤独的人是可耻的.mp3",
        "123abc.def/张楚/蚂蚁蚂蚁.mp3",
    ]

    purge_url = reverse("task-purge")
    response = self.client.post(purge_url)

    assert response.json() == {"done": True}
    after_purge = sorted(list_files(settings.DATA_DIR))
    assert after_purge == [
        "baidupcsleecher_trash/123abc.def.sample/张楚/孤独的人是可耻的.mp3",
        "baidupcsleecher_trash/123abc.def.sample/张楚/蚂蚁蚂蚁.mp3",
        "baidupcsleecher_trash/123abc.def/张楚/孤独的人是可耻的.mp3",
        "baidupcsleecher_trash/123abc.def/张楚/蚂蚁蚂蚁.mp3",
    ]

def test_purge_all(self):
    # With move_to_trash disabled nothing may remain in the data directory.
    touch_task_files(self.task)
    self.task.delete()
    assert list_files(settings.DATA_DIR) != []

    payload = {"move_to_trash": False}
    purge_url = reverse("task-purge")
    response = self.client.post(purge_url, data=payload, format="json")

    assert response.json() == {"done": True}
    assert list_files(settings.DATA_DIR) == []

def test_purge_nothing(self):
    # Files of a task that still exists must survive a purge untouched.
    touch_task_files(self.task)
    existing = sorted(list_files(settings.DATA_DIR))
    assert existing == [
        "123abc.def.sample/张楚/孤独的人是可耻的.mp3",
        "123abc.def.sample/张楚/蚂蚁蚂蚁.mp3",
        "123abc.def/张楚/孤独的人是可耻的.mp3",
        "123abc.def/张楚/蚂蚁蚂蚁.mp3",
    ]

    purge_url = reverse("task-purge")
    response = self.client.post(purge_url)

    assert response.json() == {"done": True}
    remaining = sorted(list_files(settings.DATA_DIR))
    assert remaining == existing
102 changes: 102 additions & 0 deletions task/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import logging
import os
import re
import traceback
from collections import deque
from http.cookies import SimpleCookie
from pathlib import Path
from typing import Iterator
from typing import List
from typing import Tuple
from urllib.parse import parse_qs
from urllib.parse import urlparse

Expand Down Expand Up @@ -112,3 +116,101 @@ def match_regex(string: str, regex: str) -> bool:
"""
pattern = re.compile(regex)
return bool(re.match(pattern, string))


def walk_dir(path: Path) -> Iterator[Tuple[Path, List[os.DirEntry]]]:
    """
    Walk a directory tree breadth-first, yielding every directory together
    with the entries it contains.

    Args:
        path (Path): The path to the directory to start from.

    Yields:
        Tuple[Path, List[os.DirEntry]]: The directory currently visited and
        the list of its directory entries.

    Examples:
        >>> import tempfile
        >>> with tempfile.TemporaryDirectory() as temp_dir:
        ...     test_dir = Path(temp_dir) / "test_dir"
        ...     test_dir.mkdir()
        ...     file1 = test_dir / "file1.txt"
        ...     file1.touch()
        ...     sub_dir = test_dir / "sub_dir"
        ...     sub_dir.mkdir()
        ...     file2 = sub_dir / "file2.txt"
        ...     file2.touch()
        ...     entries = list(walk_dir(test_dir))
        ...     len(entries)
        2
        >>> entries[0][0] == test_dir
        True
        >>> sorted([i.name for i in entries[0][1]])
        ['file1.txt', 'sub_dir']
        >>> entries[1][0] == sub_dir
        True
        >>> sorted([i.name for i in entries[1][1]])
        ['file2.txt']
    """
    # A deque gives O(1) pops from the front of the FIFO queue.
    pending = deque([path])
    while pending:
        current = pending.popleft()
        with os.scandir(current) as scandir_it:
            entries = list(scandir_it)
        yield current, entries
        for entry in entries:
            if entry.is_dir():
                # Use the public "/" join rather than the private
                # Path._make_child_relpath helper, which is not a stable API.
                pending.append(current / entry.name)


def walk_files(path: Path) -> Iterator[Path]:
    """
    Yield the path of every non-directory entry found below *path*.

    Directories are visited in the order produced by ``walk_dir``.

    >>> import tempfile
    >>> with tempfile.TemporaryDirectory() as temp_dir:
    ...     test_dir = Path(temp_dir) / "test_dir"
    ...     test_dir.mkdir()
    ...     file1 = test_dir / "file1.txt"
    ...     file1.touch()
    ...     sub_dir = test_dir / "sub_dir"
    ...     sub_dir.mkdir()
    ...     file2 = sub_dir / "file2.txt"
    ...     file2.touch()
    ...     files = list(walk_files(test_dir))
    ...     len(files)
    2
    >>> [i.name for i in files]
    ['file1.txt', 'file2.txt']
    """
    for root, entries in walk_dir(path):
        for entry in entries:
            if not entry.is_dir():
                # Join on the bare name: a DirEntry's fspath already contains
                # the scanned directory, so ``root / entry`` would duplicate
                # that prefix whenever ``root`` is a relative path.
                yield root / entry.name


def list_files(root: Path, without_root=True) -> List[str]:
    """
    Collect the paths of all files below *root* as strings.

    With ``without_root`` set (the default) each path is relative to *root*;
    otherwise the path is returned as produced by ``walk_files``.

    >>> import tempfile
    >>> with tempfile.TemporaryDirectory() as temp_dir:
    ...     test_dir = Path(temp_dir) / "test_dir"
    ...     test_dir.mkdir()
    ...     file1 = test_dir / "file1.txt"
    ...     file1.touch()
    ...     sub_dir = test_dir / "sub_dir"
    ...     sub_dir.mkdir()
    ...     file2 = sub_dir / "file2.txt"
    ...     file2.touch()
    ...     files = list_files(test_dir)
    ...     len(files)
    2
    >>> files
    ['file1.txt', 'sub_dir/file2.txt']
    """
    if without_root:
        return [str(found.relative_to(root)) for found in walk_files(root)]
    return [str(found) for found in walk_files(root)]
Loading