Verified Commit 59fd6eef authored by Jakob Moser's avatar Jakob Moser
Browse files

Draft main deduplication code

parent 8ee036e3
Loading
Loading
Loading
Loading
+46 −0
Original line number Diff line number Diff line
from karaokatalog.Library import Library
from karaokatalog.Song import Song
from karaokatalog.deduplicate.find_duplicates import find_duplicates
from karaokatalog.deduplicate.prune import prune
from karaokatalog.util.get_equivalence_classes import get_equivalence_classes

from pathlib import Path
from tqdm import tqdm
import sys
import logging

# Configure the root logger once at import time: timestamped lines, INFO and above.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s", level=logging.INFO
)


def main(argv):
    """Run the deduplication pipeline on the library directory given in *argv*.

    Steps (nothing is deleted yet; see the TODO at the end):

    1. Load the library from ``argv[1]``.
    2. Group songs into duplicate candidates via ``find_duplicates``.
    3. Split each candidate group into equivalence classes using
       ``Song.has_identic_files`` as the equivalence relation.
    4. Collect the pruning instructions ``prune`` yields for each class.

    Args:
        argv: Command-line argument list in ``sys.argv`` layout, i.e.
            ``argv[0]`` is the program name and ``argv[1]`` is the library
            directory. Any further arguments are ignored.

    Raises:
        SystemExit: With a usage message if the library directory is missing.
    """
    # Fail early with a readable hint instead of an IndexError traceback.
    if len(argv) < 2:
        program = argv[0] if argv else "deduplicate"
        sys.exit(f"Usage: {program} LIBRARY_DIR")

    logging.info("Karaokatalog Deduplication started")

    logging.info("Loading library")
    library = Library.from_dir(Path(argv[1]))
    logging.info("Library loaded")

    logging.info("Finding duplicates (songs with identical title and artist)")
    duplicates = find_duplicates(library)
    logging.info("Duplicates found")

    logging.info("Finding exact duplicates (files are 100% identical)")
    # TODO Make this abortable and restartable
    # Materialized as a tuple so the progress bar runs to completion here,
    # before the "found" message is logged.
    exact_duplicates = tuple(
        equivalence_class
        for songs in tqdm(duplicates.values(), unit=" duplicates")
        for equivalence_class in get_equivalence_classes(songs, Song.has_identic_files)
    )
    logging.info("Exact duplicates found")

    logging.info("Determining songs to prune")
    # Materialized as a tuple because its length is reported below.
    pruning_instructions = tuple(
        instruction
        for equivalence_class in tqdm(exact_duplicates, unit=" exact duplicates")
        for instruction in prune(equivalence_class)
    )
    logging.info(
        "%d exactly duplicated songs will be deleted", len(pruning_instructions)
    )

    # TODO Call all pruning_instructions, to actually delete the files

    logging.info("Karaokatalog Deduplication done")


if __name__ == "__main__":
    main(sys.argv)