diff --git a/README.md b/README.md index 936be18..a303e18 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,9 @@ mv -n /home/user/duplicates/bar.txt /home/user/duplicates/✓bar.txt mv -n /home/user/duplicates/third.txt /home/user/duplicates/✓third.txt ``` -# Documentation – `Deduplidog` class +# Documentation + +## Parameters Import the `Deduplidog` class and change its parameters. @@ -112,6 +114,8 @@ Import the `Deduplidog` class and change its parameters. from deduplidog import Deduplidog ``` +Or change these parameters from the CLI or TUI by launching `deduplidog`. + Find the duplicates. Normally, the file must have the same size, date and name. (Name might be just similar if parameters like strip_end_counter are set.) If media_magic=True, media files receive different rules: Neither the size nor the date are compared. See its help. | parameter | type | default | description | @@ -141,4 +145,47 @@ Find the duplicates. Normally, the file must have the same size, date and name. | media_magic | bool | False | Nor the size or date is compared for files with media suffixes.<br>A video is considered a duplicate if it has the same name and a similar number of frames, even if it has a different extension.<br>An image is considered a duplicate if it has the same name and a similar image hash, even if the files are of different sizes.<br>(This mode is considerably slower.) | | accepted_frame_delta | int | 1 | Used only when media_magic is True | | accepted_img_hash_diff | int | 1 | Used only when media_magic is True | -| img_compare_date | bool | False | If True and media_magic=True, the file date or the EXIF date must match. | \ No newline at end of file +| img_compare_date | bool | False | If True and media_magic=True, the file date or the EXIF date must match. | + +## Utils +In the `deduplidog.utils` module, you'll find several handy tools to help you. You can discover their parameters through your IDE hints. 
+ +### `images` +*`urls: Iterable[str | Path]`* Display a ribbon of images. + +### `print_video_thumbs` +*`src: str | Path`* Displays thumbnails for a video. + +### `print_videos_thumbs` +*`dir_: Path`* To quickly understand the content of each video, output its frame count and a few thumbnails from its beginning. + +### `get_frame_count` +*`filename: str|Path`* Uses cv2 to determine the video frame count. Method is cached. + +### `search_for_media_wizzard` +*`cwd: str`* Repeatedly prompt and search for files with similar names somewhere in the specified path. Display all such files as images and video previews. + +### `are_contained` +*`work_dir: str, original_dir: str, sec_range: int = 60`* You have two dirs with files using different naming systems (427.JPG vs DSC_1344) + which you suspect to contain the same set. The same files in the dirs seem to have the same timestamp. + The same timestamp means +/- sec_range (ex: 1 minute). + Loop over all files from work_dir and display corresponding files having the same timestamp, + or warn that no original exists. + +### `remove_prefix_in_workdir` +*`work_dir: str`* Removes the prefix ✓ recursively from all the files. The prefix might have been previously given by the deduplidog. + + +### `mark_symlink_by_target` +*`suspicious_directory: str | Path, starting_path: str`* If the file is a symlink whose target lies under `starting_path`, prefix its name with an arrow (→). + +``` +:param suspicious_directory: Ex: /media/user/disk/Takeout/Photos/ +:param starting_path: Ex: /media/user/disk +``` + +### `mark_symlink_only_dirs` +*`dir_: str | Path`* If the directory contains only symlinks or is empty, prefix its name with an arrow (→). + +### `mtime_files_in_dir_according_to_json` +*`dir_: str | Path, json_dir: str | Path`* Google Photos returns JSON with the photo modification time. Sets the modification time of the photos in `dir_` to the dates fetched from the directory with these JSONs. 
diff --git a/deduplidog/__main__.py b/deduplidog/__main__.py index 3f10e75..ccd0780 100644 --- a/deduplidog/__main__.py +++ b/deduplidog/__main__.py @@ -72,11 +72,15 @@ def cli(dd: Deduplidog): return dd -if __name__ == "__main__": +def main(): + global INPUTS + # CLI try: dd = cli() - if input("Continue? [Y/n] ").casefold() not in ("", "y"): + if not dd: # maybe just --help + return + if input("See more options? [Y/n] ").casefold() not in ("", "y"): sys.exit() except click.MissingParameter: # User launched the program without parameters. @@ -110,3 +114,6 @@ def cli(dd: Deduplidog): continue if input("See more options? [Y/n] ").casefold() not in ("y", ""): break + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/deduplidog/deduplidog.py b/deduplidog/deduplidog.py index 5a9517d..ccf8772 100644 --- a/deduplidog/deduplidog.py +++ b/deduplidog/deduplidog.py @@ -1,4 +1,3 @@ -import json import logging import os import re @@ -7,24 +6,19 @@ from dataclasses import dataclass from datetime import datetime from functools import cache -from itertools import chain from pathlib import Path from time import sleep from typing import Annotated, get_args, get_type_hints -from zlib import crc32 import click -import cv2 import imagehash from dataclass_click import option from humanize import naturaldelta, naturalsize -from IPython.display import Image, clear_output, display -from ipywidgets import HBox, widgets from PIL import ExifTags, Image -from sh import find from tqdm.autonotebook import tqdm from .interface_utils import Field +from .utils import _qp, crc, get_frame_count VIDEO_SUFFIXES = ".mp4", ".mov", ".avi", ".vob", ".mts", ".3gp", ".mpg", ".mpeg", ".wmv" IMAGE_SUFFIXES = ".jpg", ".jpeg", ".png", ".gif" @@ -528,9 +522,9 @@ def _find_similar(self, work_file: Path, candidates: list[Path]): for original in candidates: ost, wst = original.stat(), work_file.stat() if (self.ignore_date - or wst.st_mtime == ost.st_mtime - or self.tolerate_hour and 
self.tolerate_hour[0] <= (wst.st_mtime - ost.st_mtime)/3600 <= self.tolerate_hour[1] - ) and (self.ignore_size or wst.st_size == ost.st_size and (not self.checksum or crc(original) == crc(work_file))): + or wst.st_mtime == ost.st_mtime + or self.tolerate_hour and self.tolerate_hour[0] <= (wst.st_mtime - ost.st_mtime)/3600 <= self.tolerate_hour[1] + ) and (self.ignore_size or wst.st_size == ost.st_size and (not self.checksum or crc(original) == crc(work_file))): return original def _find_similar_media(self, work_file: Path, comparing_image: bool, candidates: list[Path]): @@ -609,209 +603,3 @@ def _print_change(self, change: Change): for text, changes in zip((f" {wicon}{self.work_dir_name}:", f" {oicon}{self.original_dir_name}:"), change.values()) if len(changes)] - -@cache -def crc(path: Path): - """ Surprisingly, sha256 and sha1 was faster than md5 when using hashlib.file_digest. However crc32 is still the fastest.""" - crc = 0 - with path.open('rb') as f: - while True: - chunk = f.read(4096) - if not chunk: - break - crc = crc32(chunk, crc) - return crc - - -def _qp(path: Path): - """Quoted path. Output path to be used in bash. I wonder there is no system method which covers - quotes in the path etc. - """ - s = str(path) - return f'"{s}"' if " " in s else s - - -# TODO: below are some functions that should be converted into documented utils or removed - -def remove_prefix_in_workdir(work_dir: str): - """ Removes the prefix ✓ recursively from all the files. - The prefix might have been previously given by the deduplidog. 
""" - work_files = [f for f in tqdm(Path(work_dir).rglob("*"), desc="Caching working files") if f.is_file()] - for p in work_files: - p.rename(p.with_stem(p.stem.removeprefix("✓"))) - - -@cache -def get_frame_count(filename): - video = cv2.VideoCapture(str(filename)) - # duration = video.get(cv2.CAP_PROP_POS_MSEC) - frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT) - return frame_count - - -def mark_symlink_by_target(suspicious_directory: str | Path, starting_path): - """ If the file is a symlink, pointing to this path, rename it with an arrow - - :param suspicious_directory: Ex: /media/user/disk/Takeout/Photos/ - :param starting_path: Ex: /media/user/disk - """ - for f in (x for x in Path(suspicious_directory).rglob("*") if x.is_symlink()): - if str(f.resolve()).startswith(starting_path): - print(f.rename(f.with_name("→" + f.name))) - print(f) - -# Opakovane vyhledavat, zde se soubory podobneho jmena naleza nekde v dane ceste. -# Zobrazit jako obrazky a nahledy videi vsechny takove soubory. - - -def _stub(): - while True: - a = input() - clear_output() - cwd = "/media/user/disk1/Photos/" - print("Searching", a, "in", cwd) - files = find("-iname", f"*{a}*", _cwd=cwd) - files = [Path(cwd, f.strip()) for f in files] - print("Len", len(files)) - images(files) - [print_video_thumbs(f) for f in files] - - -def _are_similar(original: Path, work_file: Path, accepted_img_hash_diff: int = 1): - original_pil = Image.open(original) - work_pil = Image.open(work_file) - hash0 = imagehash.average_hash(original_pil) - hash1 = imagehash.average_hash(work_pil) - # maximum bits that could be different between the hashes - return abs(hash0 - hash1) <= accepted_img_hash_diff - - -def are_contained(work_dir, original_dir, sec_range: int = 60): - """ You got two dirs with files having different naming system (427.JPG vs DSC_1344) - which you suspect to contain the same set. The same files in the dirs seem to have the same timestamp. 
- The same timestamp means +/- sec_range (ex: 1 minute). - Loop all files from work_dir and display corresponding files having the same timestamp. - or warn that no original exists- - """ - - # build directory of originals - global originals, found - originals = defaultdict(set) # [timestamp] = set(originals...) - for of in Path(original_dir).rglob("*"): - originals[of.stat().st_mtime].add(of) - - found = {} - for wf in (bar := tqdm(list(Path(work_dir).rglob("*")))): - bar.set_postfix({"file": str(wf.name), "found": len(found)}) - - timestamp = wf.stat().st_mtime - # 0, -1, 1, -2, 2 ... to find candidate earlier - range_ = sorted(range(-sec_range, sec_range+1), key=lambda x: abs(x)) - corresponding = (originals.get(timestamp + i, set()) - for i in range_) # find all originals with similar timestamps - # flatten the sets and unique them (but keep as list to preserve files with less timestamp difference first) - corresponding = list(dict.fromkeys(chain.from_iterable(corresponding))) - - if corresponding: - for candidate in (bar2 := tqdm(corresponding, leave=False, desc="Candidates")): - bar2.set_postfix({"file": candidate.name}) - if _are_similar(candidate, wf): - found[wf] = candidate - # tqdm would not dissappear if not finished https://github.com/tqdm/tqdm/issues/1382 - bar2.update(float("inf")) - bar2.close() - break - else: - print("No candidate for", wf.name, corresponding) - images([wf] + list(corresponding)) - else: - print("Missing originals for", wf.name) - - -# are_contained("/media/user/disk1/Photos/_tabor/2/", "/media/user/disk1/Photos/tabory/C 074 2016/") - - -def images(urls): - """ Display a ribbon of images """ - images_ = [] - for url in tqdm(urls, leave=False): - p = Path(url) - if p.exists(): - images_.append(widgets.Image(width=150, value=p.read_bytes())) - else: - print("Fail", p) - display(HBox(images_)) - - -def print_video_thumbs(src): - vidcap = cv2.VideoCapture(str(src)) - success, image = vidcap.read() - count = 0 - images = [] - while 
success: - success, image = vidcap.read() - if count % 100 == 0: - try: - # images.append(Image(width=150, data=cv2.imencode('.jpg', image)[1])) - images.append(widgets.Image(width=150, value=cv2.imencode('.jpg', image)[1])) - except: - break - if count > 500: - break - count += 1 - print(src, get_frame_count(src)) - if images: - display(HBox(images)) - - -def get_video_thumbs(dir_): - """ Abych rychle poznal, co v kterem videu je, vypsat delku a prvnich par screenu """ - for f in sorted(Path(dir_).rglob("*")): - if f.suffix.lower() in (".mov", ".avi", ".mp4", ".vob"): - print_video_thumbs(f) - - -get_video_thumbs("/media/user/disk1/Photos/dram/") - - -def mark_symlink_only_dirs(dir_): - """ Pokud je adresar plny jen symlinku nebo prazdny, přijmenovat mu šipku """ - for d in (x for x in Path(dir_).rglob("*") if x.is_dir()): - if all(x.is_symlink() for x in Path(d).glob("*")): - print(d.rename(d.with_name("→" + d.name))) - - -# mark_symlink_only_dirs("/media/user/disk2/Takeoutuser/Google Photos/") - - -def mark_01_copies(suspicious_directory): - for f in (x for x in Path(suspicious_directory).glob("*(1)*")): - stem = f.stem.removesuffix("(1)") - - for x in (x for x in Path("/media/user/disk2/_duplikaty_smazat/").rglob("*") if x.stem.removeprefix("✓") == stem): - print(f.rename(f.with_name("→" + f.name))) -# mark_01_copies("/media/user/disk2/Takeoutuser/YouTube and YouTube Music/videos/") - - -def mtime_files_in_dir_according_to_json(dir_, json_dir): - """ google photos vrací json, kde je čas fotky - Kromě JPG. 
- """ - for photo in Path(dir_).rglob("*"): - # if photo.suffix.lower() in (".jpg", ".jpeg"): - # continue - # if "50607264_2240519186012556_9095104762705084416_o.jpg" not in photo.name: - # continue - metadata = Path(json_dir).joinpath(photo.name[:46] + ".json") - if metadata.exists(): - # if photo.stat().st_mtime < 1654812000: - # zmenit jenom takove soubory, ktere uz nebyly zmeneny jinak, - # coz poznam tak, ze jejich datum je 10.6.2022 - # continue - timestamp = json.loads(metadata.read_text())["photoTakenTime"]["timestamp"] - os.utime(photo, (int(timestamp), int(timestamp))) - print(photo) - # break - -# mtime_files_in_dir_according_to_json("/media/user/disk2/Takeoutuser/Google Photos/Photos from 2019/", - # "/media/user/disk2/photos_json/") diff --git a/deduplidog/utils.py b/deduplidog/utils.py new file mode 100644 index 0000000..a597d4b --- /dev/null +++ b/deduplidog/utils.py @@ -0,0 +1,191 @@ +import json +import os +from collections import defaultdict +from functools import cache +from itertools import chain +from pathlib import Path +from typing import Iterable +from zlib import crc32 + +import cv2 +import imagehash +from IPython.display import clear_output, display +from ipywidgets import HBox, widgets +from PIL import Image +from sh import find +from tqdm.autonotebook import tqdm + + +@cache +def crc(path: Path): # undocumented function + """ Count CRC32 file hash. + Surprisingly, sha256 and sha1 was faster than md5 when using hashlib.file_digest. However crc32 is still the fastest.""" + crc = 0 + with path.open('rb') as f: + while True: + chunk = f.read(4096) + if not chunk: + break + crc = crc32(chunk, crc) + return crc + + +def _qp(path: Path): + """Quoted path. Output path to be used in bash. I wonder there is no system method which covers + quotes in the path etc. + """ + s = str(path) + return f'"{s}"' if " " in s else s + + +def images(urls: Iterable[str | Path]): + """ Display a ribbon of images. 
""" + images_ = [] + for url in tqdm(urls, leave=False): + p = Path(url) + if p.exists(): + images_.append(widgets.Image(width=150, value=p.read_bytes())) + else: + print("Fail", p) + display(HBox(images_)) + + +def print_video_thumbs(src: str | Path): + """ Displays thumbnails for a video """ + vidcap = cv2.VideoCapture(str(src)) + success, image = vidcap.read() + count = 0 + images = [] + while success: + success, image = vidcap.read() + if count % 100 == 0: + try: + # images.append(Image(width=150, data=cv2.imencode('.jpg', image)[1])) + images.append(widgets.Image(width=150, value=cv2.imencode('.jpg', image)[1])) + except: + break + if count > 500: + break + count += 1 + print(src, get_frame_count(src)) + if images: + display(HBox(images)) + + +def print_videos_thumbs(dir_: Path): + """ To quickly understand the content of each video, output the duration and the first few frames. """ + for f in sorted(Path(dir_).rglob("*")): + if f.suffix.lower() in (".mov", ".avi", ".mp4", ".vob"): + print_video_thumbs(f) + + +@cache +def get_frame_count(filename: str | Path): + """ Uses cv2 to determine the video frame count. Method is cached.""" + video = cv2.VideoCapture(str(filename)) + # duration = video.get(cv2.CAP_PROP_POS_MSEC) + frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT) + return frame_count + + +def search_for_media_wizzard(cwd: str): + """ Repeatedly prompt and search for files with similar names somewhere in the specified path. + Display all such files as images and video previews. 
""" + while True: + query = input() + clear_output() + print("Searching", query, "in", cwd) + files = find("-iname", f"*{query}*", _cwd=cwd) + files = [Path(cwd, f.strip()) for f in files] + print("Len", len(files)) + images(files) + [print_video_thumbs(f) for f in files] + + +def _are_similar(original: Path, work_file: Path, accepted_img_hash_diff: int = 1): + original_pil = Image.open(original) + work_pil = Image.open(work_file) + hash0 = imagehash.average_hash(original_pil) + hash1 = imagehash.average_hash(work_pil) + # maximum bits that could be different between the hashes + return abs(hash0 - hash1) <= accepted_img_hash_diff + + +def are_contained(work_dir: str, original_dir: str, sec_range: int = 60): + """ You got two dirs with files having different naming system (427.JPG vs DSC_1344) + which you suspect to contain the same set. The same files in the dirs seem to have the same timestamp. + The same timestamp means +/- sec_range (ex: 1 minute). + Loop all files from work_dir and display corresponding files having the same timestamp. + or warn that no original exists. + """ + + # build directory of originals + originals = defaultdict(set) # [timestamp] = set(originals...) + for of in Path(original_dir).rglob("*"): + originals[of.stat().st_mtime].add(of) + + found = {} + for wf in (bar := tqdm(list(Path(work_dir).rglob("*")))): + bar.set_postfix({"file": str(wf.name), "found": len(found)}) + + timestamp = wf.stat().st_mtime + # 0, -1, 1, -2, 2 ... 
to find candidate earlier + range_ = sorted(range(-sec_range, sec_range+1), key=lambda x: abs(x)) + corresponding = (originals.get(timestamp + i, set()) + for i in range_) # find all originals with similar timestamps + # flatten the sets and unique them (but keep as list to preserve files with less timestamp difference first) + corresponding = list(dict.fromkeys(chain.from_iterable(corresponding))) + + if corresponding: + for candidate in (bar2 := tqdm(corresponding, leave=False, desc="Candidates")): + bar2.set_postfix({"file": candidate.name}) + if _are_similar(candidate, wf): + found[wf] = candidate + # tqdm would not dissappear if not finished https://github.com/tqdm/tqdm/issues/1382 + bar2.update(float("inf")) + bar2.close() + break + else: + print("No candidate for", wf.name, corresponding) + images([wf] + list(corresponding)) + else: + print("Missing originals for", wf.name) + + +def remove_prefix_in_workdir(work_dir: str): + """ Removes the prefix ✓ recursively from all the files. + The prefix might have been previously given by the deduplidog. 
""" + work_files = [f for f in tqdm(Path(work_dir).rglob("*"), desc="Caching working files") if f.is_file()] + for p in work_files: + p.rename(p.with_stem(p.stem.removeprefix("✓"))) + + +def mark_symlink_by_target(suspicious_directory: str | Path, starting_path: str): + """ If the file is a symlink, pointing to this path, rename it with an arrow + + :param suspicious_directory: Ex: /media/user/disk/Takeout/Photos/ + :param starting_path: Ex: /media/user/disk + """ + for f in (x for x in Path(suspicious_directory).rglob("*") if x.is_symlink()): + if str(f.resolve()).startswith(starting_path): + print(f.rename(f.with_name("→" + f.name))) + print(f) + + +def mark_symlink_only_dirs(dir_: str | Path): + """If the directory is full of only symlinks or empty, rename it to an arrow.""" + for d in (x for x in Path(dir_).rglob("*") if x.is_dir()): + if all(x.is_symlink() for x in Path(d).glob("*")): + print(d.rename(d.with_name("→" + d.name))) + + +def mtime_files_in_dir_according_to_json(dir_: str | Path, json_dir: str | Path): + """ Google Photos returns JSON with the photo modification time. + Sets the photos from the dir_ to the dates fetched from the directory with these JSONs. + """ + for photo in Path(dir_).rglob("*"): + metadata = Path(json_dir).joinpath(photo.name[:46] + ".json") + if metadata.exists(): + timestamp = json.loads(metadata.read_text())["photoTakenTime"]["timestamp"] + os.utime(photo, (int(timestamp), int(timestamp))) + print(photo) diff --git a/pyproject.toml b/pyproject.toml index 877e464..70938ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,4 +21,7 @@ opencv-python = "*" Pillow = "*" textual = "~=0.52.1" sh = "*" -tqdm = "*" \ No newline at end of file +tqdm = "*" + +[tool.poetry.scripts] +deduplidog= "deduplidog.__main__:main" \ No newline at end of file