tooling: faster repo dupe + std elapsed string gen
rewrite a few things on sidestepper so that we can get back the sim and ignored directories found by the large file finding algorithm (LargeFileFilterResult) from 23.5″ to 4.6″ (58″ overall to ~35″ overall) -- approx 5x faster rn
This commit is contained in:
parent
b3ea2625d5
commit
e4639b03df
151
sidestepper.py
151
sidestepper.py
|
@ -205,6 +205,24 @@ def one_sided(a: A, bbb: Iterable[B]) -> Iterator[OneSided[A, B]]:
|
|||
yield OneSided(a, b)
|
||||
|
||||
|
||||
def generate_time_elapsed_string(time_taken: float) -> str:
|
||||
"""generates a human-readable time-elapsed string from a time taken float"""
|
||||
hours = int(time_taken // 3600)
|
||||
minutes = int(time_taken % 3600 // 60)
|
||||
seconds = int(time_taken % 60)
|
||||
|
||||
time_taken_string: str
|
||||
|
||||
if time_taken > 3600:
|
||||
time_taken_string = f"{hours}h {minutes}′ {seconds}″"
|
||||
elif time_taken > 60:
|
||||
time_taken_string = f"{minutes}′ {seconds}″"
|
||||
else:
|
||||
time_taken_string = f"{time_taken:.2f}″"
|
||||
|
||||
return time_taken_string
|
||||
|
||||
|
||||
@dataclass(eq=True, frozen=True)
|
||||
class SideStepIgnoreMatcher:
|
||||
"""immutable gitignore matcher"""
|
||||
|
@ -234,7 +252,7 @@ class SideStepIgnoreMatcher:
|
|||
root=self.root, rules=self.rules + ((gitignore.parent, tuple(new_ruleset)),)
|
||||
)
|
||||
|
||||
def match(self, file: Path) -> bool:
|
||||
def match(self, file: Path | str) -> bool:
|
||||
"""returns True if the file is ignored by any of the rules in the gitignore files, False otherwise"""
|
||||
matched = False
|
||||
|
||||
|
@ -272,6 +290,24 @@ class SideStepIgnoreMatcher:
|
|||
return any(rule.negation for rule in ruleset)
|
||||
|
||||
|
||||
@dataclass(eq=True, frozen=True)
|
||||
class LargeFileFilterResult:
|
||||
"""
|
||||
result data structure of the large file filter
|
||||
|
||||
files: tuple[Path, ...]
|
||||
large files found
|
||||
matcher: SideStepIgnoreMatcher
|
||||
the *ignore matcher instance
|
||||
ignore_directories: tuple[Path, ...]
|
||||
directories that were ignored
|
||||
"""
|
||||
|
||||
files: tuple[Path, ...]
|
||||
matcher: SideStepIgnoreMatcher
|
||||
ignore_directories: tuple[Path, ...]
|
||||
|
||||
|
||||
def _parallel() -> bool:
|
||||
"""
|
||||
helper function to determine if we should use multiprocessing;
|
||||
|
@ -311,7 +347,7 @@ def _iter_files(
|
|||
yield target_file
|
||||
|
||||
|
||||
def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
|
||||
def iter_files(target_dir: Path) -> tuple[tuple[Path, ...], SideStepIgnoreMatcher]:
|
||||
"""
|
||||
get all non-git files and register .gitignore files
|
||||
|
||||
|
@ -319,8 +355,8 @@ def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
|
|||
target_dir: Path
|
||||
the directory to search in
|
||||
|
||||
returns: tuple[list[Path], SideStepIgnoreMatcher]
|
||||
list of all files in the target directory and a SideStepIgnoreMatcher instance
|
||||
returns: tuple[tuple[Path, ...], SideStepIgnoreMatcher]
|
||||
tuple of all files in the target directory and a SideStepIgnoreMatcher instance
|
||||
"""
|
||||
|
||||
all_files: list[Path] = []
|
||||
|
@ -335,7 +371,7 @@ def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
|
|||
if file.name == ".gitignore":
|
||||
sim = sim.add_gitignore(file)
|
||||
|
||||
return all_files, sim
|
||||
return tuple(all_files), sim
|
||||
|
||||
|
||||
def _filter_sim_match(
|
||||
|
@ -372,9 +408,10 @@ def _filter_ign_dirs_and_size(os: OneSided[list[Path], Path]) -> Path | None:
|
|||
return None
|
||||
|
||||
|
||||
def _find_large_files_single(target: Path) -> list[Path]:
|
||||
def _find_large_files_single(
|
||||
files: tuple[Path, ...], sim: SideStepIgnoreMatcher
|
||||
) -> LargeFileFilterResult:
|
||||
"""single-process implementation of find_large_files"""
|
||||
files, sim = iter_files(target)
|
||||
ignore_dirs: list[Path] = []
|
||||
|
||||
_files = []
|
||||
|
@ -394,15 +431,21 @@ def _find_large_files_single(target: Path) -> list[Path]:
|
|||
leave=False,
|
||||
total=len(_files),
|
||||
):
|
||||
if f := _filter_ign_dirs_and_size(fds_os):
|
||||
f = _filter_ign_dirs_and_size(fds_os)
|
||||
if f is not None:
|
||||
large_files.append(f)
|
||||
|
||||
return large_files
|
||||
return LargeFileFilterResult(
|
||||
files=tuple(large_files),
|
||||
matcher=sim,
|
||||
ignore_directories=tuple(ignore_dirs),
|
||||
)
|
||||
|
||||
|
||||
def _find_large_files_parallel(target: Path) -> list[Path]:
|
||||
def _find_large_files_parallel(
|
||||
files: tuple[Path, ...], sim: SideStepIgnoreMatcher
|
||||
) -> LargeFileFilterResult:
|
||||
"""multiprocess implementation of find_large_files"""
|
||||
files, sim = iter_files(target)
|
||||
manager = Manager()
|
||||
ignore_dirs: ListProxy[Path] = manager.list()
|
||||
|
||||
|
@ -420,40 +463,51 @@ def _find_large_files_parallel(target: Path) -> list[Path]:
|
|||
if f is not None
|
||||
]
|
||||
|
||||
return [
|
||||
f
|
||||
for f in process_map(
|
||||
_filter_ign_dirs_and_size,
|
||||
one_sided(a=ignore_dirs, bbb=_files),
|
||||
desc="1 pre | finding large files - dir rematching (3/3)",
|
||||
leave=False,
|
||||
chunksize=SOTA_SIDESTEP_CHUNK_SIZE,
|
||||
max_workers=SOTA_SIDESTEP_MAX_WORKERS,
|
||||
total=len(files),
|
||||
)
|
||||
if f is not None
|
||||
]
|
||||
large_files: tuple[Path, ...] = tuple(
|
||||
[
|
||||
f
|
||||
for f in process_map(
|
||||
_filter_ign_dirs_and_size,
|
||||
one_sided(a=ignore_dirs, bbb=_files),
|
||||
desc="1 pre | finding large files - dir rematching (3/3)",
|
||||
leave=False,
|
||||
chunksize=SOTA_SIDESTEP_CHUNK_SIZE,
|
||||
max_workers=SOTA_SIDESTEP_MAX_WORKERS,
|
||||
total=len(files),
|
||||
)
|
||||
if f is not None
|
||||
]
|
||||
)
|
||||
|
||||
return LargeFileFilterResult(
|
||||
files=large_files,
|
||||
matcher=sim,
|
||||
ignore_directories=tuple(ignore_dirs),
|
||||
)
|
||||
|
||||
|
||||
def find_large_files(target: Path) -> list[Path]:
|
||||
def find_large_files(
|
||||
files: tuple[Path, ...], matcher: SideStepIgnoreMatcher
|
||||
) -> LargeFileFilterResult:
|
||||
"""
|
||||
finds all files larger than a certain size in a directory;
|
||||
uses SOTA_SIDESTEP_LARGE_FILE_SIZE as the size threshold
|
||||
|
||||
args:
|
||||
target_dir: Path
|
||||
the directory to search in
|
||||
files: tuple[Path, ...]
|
||||
list of files to search through
|
||||
matcher: SideStepIgnoreMatcher
|
||||
the ignore matcher instance from iter_files()
|
||||
|
||||
returns: list[Path]
|
||||
list of large files
|
||||
returns: LargeFileFilterResult
|
||||
"""
|
||||
if _parallel():
|
||||
return _find_large_files_parallel(target)
|
||||
return _find_large_files_parallel(files, matcher)
|
||||
else:
|
||||
return _find_large_files_single(target)
|
||||
return _find_large_files_single(files, matcher)
|
||||
|
||||
|
||||
def write_sotaignore(large_files: list[Path]) -> bool:
|
||||
def write_sotaignore(large_files: tuple[Path, ...]) -> bool:
|
||||
"""
|
||||
writes out a .sotaignore file with a list of large files,
|
||||
updating an existing one if already present
|
||||
|
@ -514,23 +568,35 @@ def main() -> None:
|
|||
|
||||
cumulative_start_time = time()
|
||||
|
||||
print(f"1/2{INDENT}finding large files... ", end="", file=stderr)
|
||||
print(f"1/3{INDENT}pre-scanning repository... ", end="", file=stderr)
|
||||
start_time = time()
|
||||
large_files = find_large_files(REPO_DIR)
|
||||
files, sim = iter_files(REPO_DIR)
|
||||
end_time = time()
|
||||
print(
|
||||
f"1/2{INDENT}finding large files... "
|
||||
f"done in {end_time - start_time:.2f}″ "
|
||||
f"1/3{INDENT}pre-scanning repository... "
|
||||
f"done in {generate_time_elapsed_string(end_time - start_time)} "
|
||||
f"(found {len(files)})",
|
||||
file=stderr,
|
||||
)
|
||||
|
||||
print(f"2/3{INDENT}finding large files... ", end="", file=stderr)
|
||||
start_time = time()
|
||||
large_files = find_large_files(files, sim).files
|
||||
end_time = time()
|
||||
print(
|
||||
f"2/3{INDENT}finding large files... "
|
||||
f"done in {generate_time_elapsed_string(end_time - start_time)} "
|
||||
f"(found {len(large_files)})",
|
||||
file=stderr,
|
||||
)
|
||||
|
||||
print(f"2/2{INDENT}writing .sotaignore file... ", end="", file=stderr)
|
||||
print(f"3/3{INDENT}writing .sotaignore file... ", end="", file=stderr)
|
||||
start_time = time()
|
||||
was_written = write_sotaignore(large_files)
|
||||
end_time = time()
|
||||
print(
|
||||
("done" if was_written else "skipped") + f" in {end_time - start_time:.2f}″\n",
|
||||
("done" if was_written else "skipped")
|
||||
+ f" in {generate_time_elapsed_string(end_time - start_time)}\n",
|
||||
file=stderr,
|
||||
)
|
||||
|
||||
|
@ -538,14 +604,9 @@ def main() -> None:
|
|||
print(file.relative_to(REPO_DIR))
|
||||
|
||||
cumulative_end_time = time()
|
||||
time_taken = cumulative_end_time - cumulative_start_time
|
||||
time_taken_string: str
|
||||
if time_taken > 60:
|
||||
time_taken_string = f"{int(time_taken // 60)}′{int(time_taken % 60)}″"
|
||||
else:
|
||||
time_taken_string = f"{time_taken:.2f}″"
|
||||
print(
|
||||
f"\n--- done! took {time_taken_string}~ " "☆*: .。. o(≧▽≦)o .。.:*☆ ---",
|
||||
f"\n--- done! took {generate_time_elapsed_string(cumulative_end_time - cumulative_start_time)}~ "
|
||||
"☆*: .。. o(≧▽≦)o .。.:*☆ ---",
|
||||
flush=True,
|
||||
file=stderr,
|
||||
)
|
||||
|
|
66
sync.py
66
sync.py
|
@ -17,8 +17,11 @@ from typing import Callable, Final, TypeVar
|
|||
try:
|
||||
from sidestepper import (
|
||||
SOTA_SIDESTEP_MAX_WORKERS,
|
||||
LargeFileFilterResult,
|
||||
find_large_files,
|
||||
generate_command_failure_message,
|
||||
generate_time_elapsed_string,
|
||||
iter_files,
|
||||
run,
|
||||
write_sotaignore,
|
||||
)
|
||||
|
@ -54,7 +57,7 @@ class CopyHighway:
|
|||
for use with shutil.copytree(); also displays a progress bar
|
||||
"""
|
||||
|
||||
def __init__(self, message: str, total: int):
|
||||
def __init__(self, message: str, total: int, lff_result: LargeFileFilterResult):
|
||||
"""
|
||||
multithreaded file copying class that gives a copy2-like function
|
||||
for use with shutil.copytree()
|
||||
|
@ -64,6 +67,8 @@ class CopyHighway:
|
|||
message to display in the progress bar
|
||||
total: int
|
||||
total number of files to copy
|
||||
lff_result: LargeFileFilterResult
|
||||
result of the large file filter
|
||||
"""
|
||||
self.pool = ThreadPool(
|
||||
processes=SOTA_SIDESTEP_MAX_WORKERS,
|
||||
|
@ -74,13 +79,27 @@ class CopyHighway:
|
|||
unit=" files",
|
||||
leave=False,
|
||||
)
|
||||
self.lff_result = lff_result
|
||||
|
||||
def callback(self, a: R):
|
||||
self.pbar.update()
|
||||
return a
|
||||
|
||||
def copy2(self, source: str, dest: str):
|
||||
def copy2(self, source: Path | str, dest: Path | str) -> None:
|
||||
"""shutil.copy2()-like function for use with shutil.copytree()"""
|
||||
|
||||
# ignore check 1: dir
|
||||
for ign_dir in self.lff_result.ignore_directories:
|
||||
if str(ign_dir) in str(source):
|
||||
return None
|
||||
|
||||
# ignore check 2: file
|
||||
# ... we don't need to use the trytrytry method
|
||||
# ... because we already did that as part of the large file filter,
|
||||
# ... and as such we checked for it with the first check above
|
||||
if self.lff_result.matcher.match(source):
|
||||
return None
|
||||
|
||||
self.pool.apply_async(copy2, args=(source, dest), callback=self.callback)
|
||||
|
||||
def __enter__(self):
|
||||
|
@ -286,7 +305,10 @@ def step(
|
|||
|
||||
# yay
|
||||
if desc != "" and post_print:
|
||||
print(f" done in {end_time - start_time:.2f}″", flush=True)
|
||||
print(
|
||||
f" done in {generate_time_elapsed_string(end_time - start_time)}",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
return rp
|
||||
|
||||
|
@ -402,14 +424,17 @@ def main() -> None:
|
|||
"critical error: repository is not clean, please commit changes first",
|
||||
)
|
||||
|
||||
start_time = time()
|
||||
print("1 pre | finding large files", end="", flush=True)
|
||||
files, sim = iter_files(REPO_DIR)
|
||||
|
||||
if "--skipsotaignoregen" not in argv:
|
||||
(print("1 pre | finding large files", end="", flush=True),)
|
||||
start_time = time()
|
||||
large_files = find_large_files(REPO_DIR)
|
||||
flf_filter_result = find_large_files(files, sim)
|
||||
large_files = flf_filter_result.files
|
||||
end_time = time()
|
||||
print(
|
||||
"1 pre | finding large files... "
|
||||
f"done in {end_time - start_time:.2f}″ (found {len(large_files)})"
|
||||
f"done in {generate_time_elapsed_string(end_time - start_time)} (found {len(large_files)})"
|
||||
)
|
||||
|
||||
if large_files:
|
||||
|
@ -422,15 +447,25 @@ def main() -> None:
|
|||
)
|
||||
end_time = time()
|
||||
if was_written:
|
||||
print(f" done in {end_time - start_time:.2f}″")
|
||||
print(
|
||||
f" done in {generate_time_elapsed_string(end_time - start_time)}"
|
||||
)
|
||||
else:
|
||||
print(" not needed")
|
||||
else:
|
||||
end_time = time()
|
||||
print(
|
||||
"1 pre | finding large files... "
|
||||
f"skipped in {generate_time_elapsed_string(end_time - start_time)}"
|
||||
)
|
||||
|
||||
print("3 pre | duplicating repo... pre-scanning", end="", flush=True)
|
||||
|
||||
start_time = time()
|
||||
with CopyHighway(
|
||||
"3 pre | duplicating repo", total=len(list(REPO_DIR.rglob("*")))
|
||||
message="3 pre | duplicating repo",
|
||||
total=len(list(REPO_DIR.rglob("*"))),
|
||||
lff_result=flf_filter_result,
|
||||
) as copier:
|
||||
copytree(
|
||||
src=REPO_DIR,
|
||||
|
@ -440,7 +475,7 @@ def main() -> None:
|
|||
)
|
||||
end_time = time()
|
||||
print(
|
||||
f"3 pre | duplicating repo... done in {end_time - start_time:.2f}″",
|
||||
f"3 pre | duplicating repo... done in {generate_time_elapsed_string(end_time - start_time)}",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
|
@ -548,7 +583,7 @@ def main() -> None:
|
|||
r["remote/github"] = "github"
|
||||
|
||||
step(
|
||||
desc=f"X fin | pushing to github/{branch}",
|
||||
desc=f"X fin | pushing to {r['remote/github']}/{branch}",
|
||||
func=cmd(
|
||||
f"git push {r['remote/github']} {branch} --force"
|
||||
if ("--test" not in argv)
|
||||
|
@ -557,14 +592,9 @@ def main() -> None:
|
|||
)
|
||||
|
||||
cumulative_end_time = time()
|
||||
time_taken = cumulative_end_time - cumulative_start_time
|
||||
time_taken_string: str
|
||||
if time_taken > 60:
|
||||
time_taken_string = f"{int(time_taken // 60)}′{int(time_taken % 60)}″"
|
||||
else:
|
||||
time_taken_string = f"{time_taken:.2f}″"
|
||||
print(
|
||||
f"\n--- done! took {time_taken_string}~ " "☆*: .。. o(≧▽≦)o .。.:*☆ ---",
|
||||
f"\n--- done! took {generate_time_elapsed_string(cumulative_end_time - cumulative_start_time)}~ "
|
||||
"☆*: .。. o(≧▽≦)o .。.:*☆ ---",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
|
|
Reference in a new issue