From 2e65e1bf9aae2b94be306ee60aa83cfa1c7170d2 Mon Sep 17 00:00:00 2001
From: Jose134
Date: Sun, 8 Dec 2024 05:53:12 +0100
Subject: [PATCH] Remove duplicate files using Czkawka

---
Review notes (text in this section is not part of the commit message):
- src/deduplication.py gains docstrings, no longer binds the None result of
  _remove_duplicates, and reports a missing Czkawka executable instead of
  raising FileNotFoundError.
- The blob id on the src/deduplication.py index line still refers to the
  pre-review content.

 .gitignore           |  3 +++
 README.md            |  2 +-
 src/deduplication.py | 62 ++++++++++++++++++++++++++++++++++++++++++++
 src/main.py          |  3 +++
 4 files changed, 69 insertions(+), 1 deletion(-)
 create mode 100644 src/deduplication.py

diff --git a/.gitignore b/.gitignore
index 82f9275..7fcfb17 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,3 +160,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+
+vendor/czkawka/*
\ No newline at end of file
diff --git a/README.md b/README.md
index 6a3ab65..f52c210 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ Store all the logs for the current job in order to be able to debug and keep tra
 - [x] Read qbittorrent credentials from .env file
 - [x] Implement API endpoint using FastAPI
 - [x] Run organization job on a separate thread
-- [ ] Deduplicate files using Czkawka
+- [x] Deduplicate files using Czkawka
 - [ ] Add unit tests
 - [ ] Add logging
 - [ ] Make it run in docker
diff --git a/src/deduplication.py b/src/deduplication.py
new file mode 100644
index 0000000..13f6b77
--- /dev/null
+++ b/src/deduplication.py
@@ -0,0 +1,62 @@
+"""Remove duplicate video files by delegating to the Czkawka CLI."""
+
+import subprocess
+import os
+from enum import Enum
+
+
+class CZKAWKA_DELETION_METHOD(Enum):
+    """Values this module passes to czkawka_cli's ``--delete-method`` option."""
+
+    ALL_EXCEPT_NEWEST = "AEN"
+    ALL_EXCEPT_OLDEST = "AEO"
+    ALL_EXCEPT_BIGGEST = "AEB"
+    ALL_EXCEPT_SMALLEST = "AES"
+    NONE = "NONE"
+
+
+def deduplicate_files(target_dir, exclude_files):
+    """Run Czkawka on ``target_dir`` with the ALL_EXCEPT_SMALLEST delete method.
+
+    The Czkawka executable is read from the ``CZKAWKA_PATH`` environment
+    variable, falling back to ``/app/vendor/czkawka/czkawka_cli``.
+
+    Args:
+        target_dir: Directory to scan; ``--not-recursive`` is always passed.
+        exclude_files: Items passed to ``--excluded-items``; may be empty.
+    """
+    czkawka_path = os.environ.get("CZKAWKA_PATH", "/app/vendor/czkawka/czkawka_cli")
+    # _remove_duplicates returns nothing, so there is no result to keep.
+    _remove_duplicates(
+        czkawka_path, target_dir, exclude_files, CZKAWKA_DELETION_METHOD.ALL_EXCEPT_SMALLEST)
+
+
+def _remove_duplicates(czkawka_path, target_dir, exclude_files, delete_method: CZKAWKA_DELETION_METHOD):
+    """Run ``czkawka_cli video`` on ``target_dir`` and print what it reports.
+
+    Failures are printed rather than raised, so a failing or missing Czkawka
+    binary does not abort the calling job.
+
+    Args:
+        czkawka_path: Path of the czkawka_cli executable.
+        target_dir: Directory passed to ``--directories``.
+        exclude_files: Items passed to ``--excluded-items``; skipped when empty.
+        delete_method: Member whose value is passed to ``--delete-method``.
+    """
+    flags = ["video", "--directories", target_dir, "--not-recursive", "--delete-method", delete_method.value]
+    if exclude_files:
+        flags.append("--excluded-items")
+        flags.extend(exclude_files)
+    # CK_DUPLICATE_TOLERANCE overrides the default tolerance of "2".
+    flags.extend(["--tolerance", os.environ.get("CK_DUPLICATE_TOLERANCE", "2")])
+    print(flags)
+
+    try:
+        result = subprocess.run([czkawka_path, *flags], capture_output=True, text=True, check=True)
+    except FileNotFoundError:
+        # Raised before the process starts when czkawka_path does not exist.
+        print(f"Failed to find duplicates: executable not found at {czkawka_path}")
+    except subprocess.CalledProcessError as e:
+        print(f"Failed to find duplicates: {e.stderr}")
+    else:
+        print(result.stdout)
diff --git a/src/main.py b/src/main.py
index b1a18db..3111f0f 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,9 +1,11 @@
 import os
 from fastapi import BackgroundTasks, FastAPI
 from dotenv import load_dotenv
+from os import path
 
 from qbittorrent_api import get_qbittorrent_files_downloading
 from filemoving import group_files_by_prefix
+from deduplication import deduplicate_files
 
 import uuid
 import time
@@ -33,4 +35,5 @@ def launch_job(job_id):
         f.write(f"{job_id}\n")
 
     # downloading = get_qbittorrent_files_downloading(qbit_url, qbit_user, qbit_password)
+    # deduplicate_files(target_dir, downloading)
     # group_files_by_prefix(target_dir, downloading)