tooling: faster repo dupe + std elapsed string gen

rewrite a few things on sidestepper so that we can get back the sim
and ignored directories found by the large file finding algorithm (LargeFileFilterResult)

from 23.5″ to 4.6″ (58″ overall to ~35″ overall) -- approx 5x faster rn
This commit is contained in:
Mark Joshwel 2024-07-27 02:28:57 +08:00
parent b3ea2625d5
commit e4639b03df
2 changed files with 154 additions and 63 deletions

View file

@ -205,6 +205,24 @@ def one_sided(a: A, bbb: Iterable[B]) -> Iterator[OneSided[A, B]]:
yield OneSided(a, b) yield OneSided(a, b)
def generate_time_elapsed_string(time_taken: float) -> str:
    """generates a human-readable time-elapsed string from a time taken float

    args:
        time_taken: float
            elapsed time in seconds

    returns: str
        "<h>h <m> <s>" for an hour or more, "<m> <s>" for a minute or more,
        else the raw seconds to two decimal places
        (NOTE(review): the minute/second unit marks may have been lost when
        this page was rendered — confirm against the original file)
    """
    hours = int(time_taken // 3600)
    minutes = int(time_taken % 3600 // 60)
    seconds = int(time_taken % 60)

    time_taken_string: str

    # use >= so exact boundaries land in the larger unit's branch:
    # with `>`, time_taken == 3600.0 rendered as "0 0" (minutes branch with
    # zero minutes) and time_taken == 60.0 rendered as "60.00" seconds
    if time_taken >= 3600:
        time_taken_string = f"{hours}h {minutes} {seconds}"
    elif time_taken >= 60:
        time_taken_string = f"{minutes} {seconds}"
    else:
        time_taken_string = f"{time_taken:.2f}"

    return time_taken_string
@dataclass(eq=True, frozen=True) @dataclass(eq=True, frozen=True)
class SideStepIgnoreMatcher: class SideStepIgnoreMatcher:
"""immutable gitignore matcher""" """immutable gitignore matcher"""
@ -234,7 +252,7 @@ class SideStepIgnoreMatcher:
root=self.root, rules=self.rules + ((gitignore.parent, tuple(new_ruleset)),) root=self.root, rules=self.rules + ((gitignore.parent, tuple(new_ruleset)),)
) )
def match(self, file: Path) -> bool: def match(self, file: Path | str) -> bool:
"""returns True if the file is ignored by any of the rules in the gitignore files, False otherwise""" """returns True if the file is ignored by any of the rules in the gitignore files, False otherwise"""
matched = False matched = False
@ -272,6 +290,24 @@ class SideStepIgnoreMatcher:
return any(rule.negation for rule in ruleset) return any(rule.negation for rule in ruleset)
@dataclass(eq=True, frozen=True)
class LargeFileFilterResult:
    """
    result data structure of the large file filter

    files: tuple[Path, ...]
        large files found
    matcher: SideStepIgnoreMatcher
        the *ignore matcher instance
    ignore_directories: tuple[Path, ...]
        directories that were ignored
    """

    # large files found by the filter
    files: tuple[Path, ...]
    # the *ignore matcher instance built up while scanning; kept so callers
    # can re-match paths without re-reading .gitignore files
    matcher: SideStepIgnoreMatcher
    # directories that were ignored (skipped) during the scan
    ignore_directories: tuple[Path, ...]
def _parallel() -> bool: def _parallel() -> bool:
""" """
helper function to determine if we should use multiprocessing; helper function to determine if we should use multiprocessing;
@ -311,7 +347,7 @@ def _iter_files(
yield target_file yield target_file
def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]: def iter_files(target_dir: Path) -> tuple[tuple[Path, ...], SideStepIgnoreMatcher]:
""" """
get all non-git files and register .gitignore files get all non-git files and register .gitignore files
@ -319,8 +355,8 @@ def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
target_dir: Path target_dir: Path
the directory to search in the directory to search in
returns: tuple[list[Path], SideStepIgnoreMatcher] returns: tuple[tuple[Path, ...], SideStepIgnoreMatcher]
list of all files in the target directory and a SideStepIgnoreMatcher instance tuple of all files in the target directory and a SideStepIgnoreMatcher instance
""" """
all_files: list[Path] = [] all_files: list[Path] = []
@ -335,7 +371,7 @@ def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
if file.name == ".gitignore": if file.name == ".gitignore":
sim = sim.add_gitignore(file) sim = sim.add_gitignore(file)
return all_files, sim return tuple(all_files), sim
def _filter_sim_match( def _filter_sim_match(
@ -372,9 +408,10 @@ def _filter_ign_dirs_and_size(os: OneSided[list[Path], Path]) -> Path | None:
return None return None
def _find_large_files_single(target: Path) -> list[Path]: def _find_large_files_single(
files: tuple[Path, ...], sim: SideStepIgnoreMatcher
) -> LargeFileFilterResult:
"""single-process implementation of find_large_files""" """single-process implementation of find_large_files"""
files, sim = iter_files(target)
ignore_dirs: list[Path] = [] ignore_dirs: list[Path] = []
_files = [] _files = []
@ -394,15 +431,21 @@ def _find_large_files_single(target: Path) -> list[Path]:
leave=False, leave=False,
total=len(_files), total=len(_files),
): ):
if f := _filter_ign_dirs_and_size(fds_os): f = _filter_ign_dirs_and_size(fds_os)
if f is not None:
large_files.append(f) large_files.append(f)
return large_files return LargeFileFilterResult(
files=tuple(large_files),
matcher=sim,
ignore_directories=tuple(ignore_dirs),
)
def _find_large_files_parallel(target: Path) -> list[Path]: def _find_large_files_parallel(
files: tuple[Path, ...], sim: SideStepIgnoreMatcher
) -> LargeFileFilterResult:
"""multiprocess implementation of find_large_files""" """multiprocess implementation of find_large_files"""
files, sim = iter_files(target)
manager = Manager() manager = Manager()
ignore_dirs: ListProxy[Path] = manager.list() ignore_dirs: ListProxy[Path] = manager.list()
@ -420,40 +463,51 @@ def _find_large_files_parallel(target: Path) -> list[Path]:
if f is not None if f is not None
] ]
return [ large_files: tuple[Path, ...] = tuple(
f [
for f in process_map( f
_filter_ign_dirs_and_size, for f in process_map(
one_sided(a=ignore_dirs, bbb=_files), _filter_ign_dirs_and_size,
desc="1 pre | finding large files - dir rematching (3/3)", one_sided(a=ignore_dirs, bbb=_files),
leave=False, desc="1 pre | finding large files - dir rematching (3/3)",
chunksize=SOTA_SIDESTEP_CHUNK_SIZE, leave=False,
max_workers=SOTA_SIDESTEP_MAX_WORKERS, chunksize=SOTA_SIDESTEP_CHUNK_SIZE,
total=len(files), max_workers=SOTA_SIDESTEP_MAX_WORKERS,
) total=len(files),
if f is not None )
] if f is not None
]
)
return LargeFileFilterResult(
files=large_files,
matcher=sim,
ignore_directories=tuple(ignore_dirs),
)
def find_large_files(target: Path) -> list[Path]: def find_large_files(
files: tuple[Path, ...], matcher: SideStepIgnoreMatcher
) -> LargeFileFilterResult:
""" """
finds all files larger than a certain size in a directory; finds all files larger than a certain size in a directory;
uses SOTA_SIDESTEP_LARGE_FILE_SIZE as the size threshold uses SOTA_SIDESTEP_LARGE_FILE_SIZE as the size threshold
args: args:
target_dir: Path files: tuple[Path, ...]
the directory to search in list of files to search through
matcher: SideStepIgnoreMatcher
the ignore matcher instance from iter_files()
returns: list[Path] returns: LargeFileFilterResult
list of large files
""" """
if _parallel(): if _parallel():
return _find_large_files_parallel(target) return _find_large_files_parallel(files, matcher)
else: else:
return _find_large_files_single(target) return _find_large_files_single(files, matcher)
def write_sotaignore(large_files: list[Path]) -> bool: def write_sotaignore(large_files: tuple[Path, ...]) -> bool:
""" """
writes out a .sotaignore file with a list of large files, writes out a .sotaignore file with a list of large files,
updating an existing one if already present updating an existing one if already present
@ -514,23 +568,35 @@ def main() -> None:
cumulative_start_time = time() cumulative_start_time = time()
print(f"1/2{INDENT}finding large files... ", end="", file=stderr) print(f"1/3{INDENT}pre-scanning repository... ", end="", file=stderr)
start_time = time() start_time = time()
large_files = find_large_files(REPO_DIR) files, sim = iter_files(REPO_DIR)
end_time = time() end_time = time()
print( print(
f"1/2{INDENT}finding large files... " f"1/3{INDENT}pre-scanning repository... "
f"done in {end_time - start_time:.2f}" f"done in {generate_time_elapsed_string(end_time - start_time)} "
f"(found {len(files)})",
file=stderr,
)
print(f"2/3{INDENT}finding large files... ", end="", file=stderr)
start_time = time()
large_files = find_large_files(files, sim).files
end_time = time()
print(
f"2/3{INDENT}finding large files... "
f"done in {generate_time_elapsed_string(end_time - start_time)} "
f"(found {len(large_files)})", f"(found {len(large_files)})",
file=stderr, file=stderr,
) )
print(f"2/2{INDENT}writing .sotaignore file... ", end="", file=stderr) print(f"3/3{INDENT}writing .sotaignore file... ", end="", file=stderr)
start_time = time() start_time = time()
was_written = write_sotaignore(large_files) was_written = write_sotaignore(large_files)
end_time = time() end_time = time()
print( print(
("done" if was_written else "skipped") + f" in {end_time - start_time:.2f}\n", ("done" if was_written else "skipped")
+ f" in {generate_time_elapsed_string(end_time - start_time)}\n",
file=stderr, file=stderr,
) )
@ -538,14 +604,9 @@ def main() -> None:
print(file.relative_to(REPO_DIR)) print(file.relative_to(REPO_DIR))
cumulative_end_time = time() cumulative_end_time = time()
time_taken = cumulative_end_time - cumulative_start_time
time_taken_string: str
if time_taken > 60:
time_taken_string = f"{int(time_taken // 60)}{int(time_taken % 60)}"
else:
time_taken_string = f"{time_taken:.2f}"
print( print(
f"\n--- done! took {time_taken_string}~ " "☆*: .。. o(≧▽≦)o .。.:*☆ ---", f"\n--- done! took {generate_time_elapsed_string(cumulative_end_time - cumulative_start_time)}~ "
"☆*: .。. o(≧▽≦)o .。.:*☆ ---",
flush=True, flush=True,
file=stderr, file=stderr,
) )

66
sync.py
View file

@ -17,8 +17,11 @@ from typing import Callable, Final, TypeVar
try: try:
from sidestepper import ( from sidestepper import (
SOTA_SIDESTEP_MAX_WORKERS, SOTA_SIDESTEP_MAX_WORKERS,
LargeFileFilterResult,
find_large_files, find_large_files,
generate_command_failure_message, generate_command_failure_message,
generate_time_elapsed_string,
iter_files,
run, run,
write_sotaignore, write_sotaignore,
) )
@ -54,7 +57,7 @@ class CopyHighway:
for use with shutil.copytree(); also displays a progress bar for use with shutil.copytree(); also displays a progress bar
""" """
def __init__(self, message: str, total: int): def __init__(self, message: str, total: int, lff_result: LargeFileFilterResult):
""" """
multithreaded file copying class that gives a copy2-like function multithreaded file copying class that gives a copy2-like function
for use with shutil.copytree() for use with shutil.copytree()
@ -64,6 +67,8 @@ class CopyHighway:
message to display in the progress bar message to display in the progress bar
total: int total: int
total number of files to copy total number of files to copy
lff_result: LargeFileFilterResult
result of the large file filter
""" """
self.pool = ThreadPool( self.pool = ThreadPool(
processes=SOTA_SIDESTEP_MAX_WORKERS, processes=SOTA_SIDESTEP_MAX_WORKERS,
@ -74,13 +79,27 @@ class CopyHighway:
unit=" files", unit=" files",
leave=False, leave=False,
) )
self.lff_result = lff_result
def callback(self, a: R): def callback(self, a: R):
self.pbar.update() self.pbar.update()
return a return a
def copy2(self, source: str, dest: str): def copy2(self, source: Path | str, dest: Path | str) -> None:
"""shutil.copy2()-like function for use with shutil.copytree()""" """shutil.copy2()-like function for use with shutil.copytree()"""
# ignore check 1: dir
for ign_dir in self.lff_result.ignore_directories:
if str(ign_dir) in str(source):
return None
# ignore check 2: file
# ... we don't need to use the trytrytry method
# ... because we already did that as part of the large file filter,
# ... and as such we checked for it with the first check above
if self.lff_result.matcher.match(source):
return None
self.pool.apply_async(copy2, args=(source, dest), callback=self.callback) self.pool.apply_async(copy2, args=(source, dest), callback=self.callback)
def __enter__(self): def __enter__(self):
@ -286,7 +305,10 @@ def step(
# yay # yay
if desc != "" and post_print: if desc != "" and post_print:
print(f" done in {end_time - start_time:.2f}", flush=True) print(
f" done in {generate_time_elapsed_string(end_time - start_time)}",
flush=True,
)
return rp return rp
@ -402,14 +424,17 @@ def main() -> None:
"critical error: repository is not clean, please commit changes first", "critical error: repository is not clean, please commit changes first",
) )
start_time = time()
print("1 pre | finding large files", end="", flush=True)
files, sim = iter_files(REPO_DIR)
if "--skipsotaignoregen" not in argv: if "--skipsotaignoregen" not in argv:
(print("1 pre | finding large files", end="", flush=True),) flf_filter_result = find_large_files(files, sim)
start_time = time() large_files = flf_filter_result.files
large_files = find_large_files(REPO_DIR)
end_time = time() end_time = time()
print( print(
"1 pre | finding large files... " "1 pre | finding large files... "
f"done in {end_time - start_time:.2f} (found {len(large_files)})" f"done in {generate_time_elapsed_string(end_time - start_time)} (found {len(large_files)})"
) )
if large_files: if large_files:
@ -422,15 +447,25 @@ def main() -> None:
) )
end_time = time() end_time = time()
if was_written: if was_written:
print(f" done in {end_time - start_time:.2f}") print(
f" done in {generate_time_elapsed_string(end_time - start_time)}"
)
else: else:
print(" not needed") print(" not needed")
else:
end_time = time()
print(
"1 pre | finding large files... "
f"skipped in {generate_time_elapsed_string(end_time - start_time)}"
)
print("3 pre | duplicating repo... pre-scanning", end="", flush=True) print("3 pre | duplicating repo... pre-scanning", end="", flush=True)
start_time = time() start_time = time()
with CopyHighway( with CopyHighway(
"3 pre | duplicating repo", total=len(list(REPO_DIR.rglob("*"))) message="3 pre | duplicating repo",
total=len(list(REPO_DIR.rglob("*"))),
lff_result=flf_filter_result,
) as copier: ) as copier:
copytree( copytree(
src=REPO_DIR, src=REPO_DIR,
@ -440,7 +475,7 @@ def main() -> None:
) )
end_time = time() end_time = time()
print( print(
f"3 pre | duplicating repo... done in {end_time - start_time:.2f}", f"3 pre | duplicating repo... done in {generate_time_elapsed_string(end_time - start_time)}",
flush=True, flush=True,
) )
@ -548,7 +583,7 @@ def main() -> None:
r["remote/github"] = "github" r["remote/github"] = "github"
step( step(
desc=f"X fin | pushing to github/{branch}", desc=f"X fin | pushing to {r['remote/github']}/{branch}",
func=cmd( func=cmd(
f"git push {r['remote/github']} {branch} --force" f"git push {r['remote/github']} {branch} --force"
if ("--test" not in argv) if ("--test" not in argv)
@ -557,14 +592,9 @@ def main() -> None:
) )
cumulative_end_time = time() cumulative_end_time = time()
time_taken = cumulative_end_time - cumulative_start_time
time_taken_string: str
if time_taken > 60:
time_taken_string = f"{int(time_taken // 60)}{int(time_taken % 60)}"
else:
time_taken_string = f"{time_taken:.2f}"
print( print(
f"\n--- done! took {time_taken_string}~ " "☆*: .。. o(≧▽≦)o .。.:*☆ ---", f"\n--- done! took {generate_time_elapsed_string(cumulative_end_time - cumulative_start_time)}~ "
"☆*: .。. o(≧▽≦)o .。.:*☆ ---",
flush=True, flush=True,
) )