tooling: faster repo dupe + std elapsed string gen

rewrite a few things on sidestepper so that we can get back the sim
and ignored directories found by the large file finding algorithm (LargeFileFilterResult)

from 23.5″ to 4.6″ (58″ overall to ~35″ overall) -- approx 5x faster rn
This commit is contained in:
Mark Joshwel 2024-07-27 02:28:57 +08:00
parent b3ea2625d5
commit e4639b03df
2 changed files with 154 additions and 63 deletions

View file

@ -205,6 +205,24 @@ def one_sided(a: A, bbb: Iterable[B]) -> Iterator[OneSided[A, B]]:
yield OneSided(a, b)
def generate_time_elapsed_string(time_taken: float) -> str:
    """
    generates a human-readable time-elapsed string from a time taken float

    args:
        time_taken: float
            elapsed time in seconds

    returns: str
        "<h>h <m>m <s>s" for an hour or more, "<m>m <s>s" for a minute or
        more, otherwise the raw seconds to two decimal places ("<s.ss>s")
    """
    hours = int(time_taken // 3600)
    minutes = int(time_taken % 3600 // 60)
    seconds = int(time_taken % 60)

    # use >= so exactly 3600 s reports as "1h 0m 0s" (not "0m 0s")
    # and exactly 60 s reports as "1m 0s" (not "60.00s")
    if time_taken >= 3600:
        return f"{hours}h {minutes}m {seconds}s"
    if time_taken >= 60:
        return f"{minutes}m {seconds}s"
    return f"{time_taken:.2f}s"
@dataclass(eq=True, frozen=True)
class SideStepIgnoreMatcher:
"""immutable gitignore matcher"""
@ -234,7 +252,7 @@ class SideStepIgnoreMatcher:
root=self.root, rules=self.rules + ((gitignore.parent, tuple(new_ruleset)),)
)
def match(self, file: Path) -> bool:
def match(self, file: Path | str) -> bool:
"""returns True if the file is ignored by any of the rules in the gitignore files, False otherwise"""
matched = False
@ -272,6 +290,24 @@ class SideStepIgnoreMatcher:
return any(rule.negation for rule in ruleset)
@dataclass(eq=True, frozen=True)
class LargeFileFilterResult:
    """
    immutable result data structure of the large file filter

    attributes:
        files: tuple[Path, ...]
            large files found
        matcher: SideStepIgnoreMatcher
            the *ignore matcher instance used during the scan
            (reusable by callers, e.g. for per-file ignore checks when copying)
        ignore_directories: tuple[Path, ...]
            directories that were ignored
    """

    files: tuple[Path, ...]
    matcher: SideStepIgnoreMatcher
    ignore_directories: tuple[Path, ...]
def _parallel() -> bool:
"""
helper function to determine if we should use multiprocessing;
@ -311,7 +347,7 @@ def _iter_files(
yield target_file
def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
def iter_files(target_dir: Path) -> tuple[tuple[Path, ...], SideStepIgnoreMatcher]:
"""
get all non-git files and register .gitignore files
@ -319,8 +355,8 @@ def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
target_dir: Path
the directory to search in
returns: tuple[list[Path], SideStepIgnoreMatcher]
list of all files in the target directory and a SideStepIgnoreMatcher instance
returns: tuple[tuple[Path, ...], SideStepIgnoreMatcher]
tuple of all files in the target directory and a SideStepIgnoreMatcher instance
"""
all_files: list[Path] = []
@ -335,7 +371,7 @@ def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
if file.name == ".gitignore":
sim = sim.add_gitignore(file)
return all_files, sim
return tuple(all_files), sim
def _filter_sim_match(
@ -372,9 +408,10 @@ def _filter_ign_dirs_and_size(os: OneSided[list[Path], Path]) -> Path | None:
return None
def _find_large_files_single(target: Path) -> list[Path]:
def _find_large_files_single(
files: tuple[Path, ...], sim: SideStepIgnoreMatcher
) -> LargeFileFilterResult:
"""single-process implementation of find_large_files"""
files, sim = iter_files(target)
ignore_dirs: list[Path] = []
_files = []
@ -394,15 +431,21 @@ def _find_large_files_single(target: Path) -> list[Path]:
leave=False,
total=len(_files),
):
if f := _filter_ign_dirs_and_size(fds_os):
f = _filter_ign_dirs_and_size(fds_os)
if f is not None:
large_files.append(f)
return large_files
return LargeFileFilterResult(
files=tuple(large_files),
matcher=sim,
ignore_directories=tuple(ignore_dirs),
)
def _find_large_files_parallel(target: Path) -> list[Path]:
def _find_large_files_parallel(
files: tuple[Path, ...], sim: SideStepIgnoreMatcher
) -> LargeFileFilterResult:
"""multiprocess implementation of find_large_files"""
files, sim = iter_files(target)
manager = Manager()
ignore_dirs: ListProxy[Path] = manager.list()
@ -420,40 +463,51 @@ def _find_large_files_parallel(target: Path) -> list[Path]:
if f is not None
]
return [
f
for f in process_map(
_filter_ign_dirs_and_size,
one_sided(a=ignore_dirs, bbb=_files),
desc="1 pre | finding large files - dir rematching (3/3)",
leave=False,
chunksize=SOTA_SIDESTEP_CHUNK_SIZE,
max_workers=SOTA_SIDESTEP_MAX_WORKERS,
total=len(files),
)
if f is not None
]
large_files: tuple[Path, ...] = tuple(
[
f
for f in process_map(
_filter_ign_dirs_and_size,
one_sided(a=ignore_dirs, bbb=_files),
desc="1 pre | finding large files - dir rematching (3/3)",
leave=False,
chunksize=SOTA_SIDESTEP_CHUNK_SIZE,
max_workers=SOTA_SIDESTEP_MAX_WORKERS,
total=len(files),
)
if f is not None
]
)
return LargeFileFilterResult(
files=large_files,
matcher=sim,
ignore_directories=tuple(ignore_dirs),
)
def find_large_files(target: Path) -> list[Path]:
def find_large_files(
files: tuple[Path, ...], matcher: SideStepIgnoreMatcher
) -> LargeFileFilterResult:
"""
finds all files larger than a certain size in a directory;
uses SOTA_SIDESTEP_LARGE_FILE_SIZE as the size threshold
args:
target_dir: Path
the directory to search in
files: tuple[Path, ...]
list of files to search through
matcher: SideStepIgnoreMatcher
the ignore matcher instance from iter_files()
returns: list[Path]
list of large files
returns: LargeFileFilterResult
"""
if _parallel():
return _find_large_files_parallel(target)
return _find_large_files_parallel(files, matcher)
else:
return _find_large_files_single(target)
return _find_large_files_single(files, matcher)
def write_sotaignore(large_files: list[Path]) -> bool:
def write_sotaignore(large_files: tuple[Path, ...]) -> bool:
"""
writes out a .sotaignore file with a list of large files,
updating an existing one if already present
@ -514,23 +568,35 @@ def main() -> None:
cumulative_start_time = time()
print(f"1/2{INDENT}finding large files... ", end="", file=stderr)
print(f"1/3{INDENT}pre-scanning repository... ", end="", file=stderr)
start_time = time()
large_files = find_large_files(REPO_DIR)
files, sim = iter_files(REPO_DIR)
end_time = time()
print(
f"1/2{INDENT}finding large files... "
f"done in {end_time - start_time:.2f}"
f"1/3{INDENT}pre-scanning repository... "
f"done in {generate_time_elapsed_string(end_time - start_time)} "
f"(found {len(files)})",
file=stderr,
)
print(f"2/3{INDENT}finding large files... ", end="", file=stderr)
start_time = time()
large_files = find_large_files(files, sim).files
end_time = time()
print(
f"2/3{INDENT}finding large files... "
f"done in {generate_time_elapsed_string(end_time - start_time)} "
f"(found {len(large_files)})",
file=stderr,
)
print(f"2/2{INDENT}writing .sotaignore file... ", end="", file=stderr)
print(f"3/3{INDENT}writing .sotaignore file... ", end="", file=stderr)
start_time = time()
was_written = write_sotaignore(large_files)
end_time = time()
print(
("done" if was_written else "skipped") + f" in {end_time - start_time:.2f}\n",
("done" if was_written else "skipped")
+ f" in {generate_time_elapsed_string(end_time - start_time)}\n",
file=stderr,
)
@ -538,14 +604,9 @@ def main() -> None:
print(file.relative_to(REPO_DIR))
cumulative_end_time = time()
time_taken = cumulative_end_time - cumulative_start_time
time_taken_string: str
if time_taken > 60:
time_taken_string = f"{int(time_taken // 60)}{int(time_taken % 60)}"
else:
time_taken_string = f"{time_taken:.2f}"
print(
f"\n--- done! took {time_taken_string}~ " "☆*: .。. o(≧▽≦)o .。.:*☆ ---",
f"\n--- done! took {generate_time_elapsed_string(cumulative_end_time - cumulative_start_time)}~ "
"☆*: .。. o(≧▽≦)o .。.:*☆ ---",
flush=True,
file=stderr,
)

66
sync.py
View file

@ -17,8 +17,11 @@ from typing import Callable, Final, TypeVar
try:
from sidestepper import (
SOTA_SIDESTEP_MAX_WORKERS,
LargeFileFilterResult,
find_large_files,
generate_command_failure_message,
generate_time_elapsed_string,
iter_files,
run,
write_sotaignore,
)
@ -54,7 +57,7 @@ class CopyHighway:
for use with shutil.copytree(); also displays a progress bar
"""
def __init__(self, message: str, total: int):
def __init__(self, message: str, total: int, lff_result: LargeFileFilterResult):
"""
multithreaded file copying class that gives a copy2-like function
for use with shutil.copytree()
@ -64,6 +67,8 @@ class CopyHighway:
message to display in the progress bar
total: int
total number of files to copy
lff_result: LargeFileFilterResult
result of the large file filter
"""
self.pool = ThreadPool(
processes=SOTA_SIDESTEP_MAX_WORKERS,
@ -74,13 +79,27 @@ class CopyHighway:
unit=" files",
leave=False,
)
self.lff_result = lff_result
    def callback(self, a: R) -> R:
        """progress callback for apply_async: tick the bar, pass the result through"""
        self.pbar.update()
        return a
def copy2(self, source: str, dest: str):
    def copy2(self, source: Path | str, dest: Path | str) -> None:
        """
        shutil.copy2()-like function for use with shutil.copytree()

        silently skips (returns None without copying) anything the large file
        filter result says to ignore; otherwise dispatches the copy to the
        thread pool asynchronously

        args:
            source: Path | str
                file to copy
            dest: Path | str
                destination path
        """
        # ignore check 1: dir
        # substring match: skip anything living under an ignored directory
        for ign_dir in self.lff_result.ignore_directories:
            if str(ign_dir) in str(source):
                return None

        # ignore check 2: file
        # ... we don't need to use the trytrytry method
        # ... because we already did that as part of the large file filter,
        # ... and as such we checked for it with the first check above
        if self.lff_result.matcher.match(source):
            return None

        # NOTE: copy2 here is shutil.copy2 (module-level import), not this method
        self.pool.apply_async(copy2, args=(source, dest), callback=self.callback)
def __enter__(self):
@ -286,7 +305,10 @@ def step(
# yay
if desc != "" and post_print:
print(f" done in {end_time - start_time:.2f}", flush=True)
print(
f" done in {generate_time_elapsed_string(end_time - start_time)}",
flush=True,
)
return rp
@ -402,14 +424,17 @@ def main() -> None:
"critical error: repository is not clean, please commit changes first",
)
start_time = time()
print("1 pre | finding large files", end="", flush=True)
files, sim = iter_files(REPO_DIR)
if "--skipsotaignoregen" not in argv:
(print("1 pre | finding large files", end="", flush=True),)
start_time = time()
large_files = find_large_files(REPO_DIR)
flf_filter_result = find_large_files(files, sim)
large_files = flf_filter_result.files
end_time = time()
print(
"1 pre | finding large files... "
f"done in {end_time - start_time:.2f} (found {len(large_files)})"
f"done in {generate_time_elapsed_string(end_time - start_time)} (found {len(large_files)})"
)
if large_files:
@ -422,15 +447,25 @@ def main() -> None:
)
end_time = time()
if was_written:
print(f" done in {end_time - start_time:.2f}")
print(
f" done in {generate_time_elapsed_string(end_time - start_time)}"
)
else:
print(" not needed")
else:
end_time = time()
print(
"1 pre | finding large files... "
f"skipped in {generate_time_elapsed_string(end_time - start_time)}"
)
print("3 pre | duplicating repo... pre-scanning", end="", flush=True)
start_time = time()
with CopyHighway(
"3 pre | duplicating repo", total=len(list(REPO_DIR.rglob("*")))
message="3 pre | duplicating repo",
total=len(list(REPO_DIR.rglob("*"))),
lff_result=flf_filter_result,
) as copier:
copytree(
src=REPO_DIR,
@ -440,7 +475,7 @@ def main() -> None:
)
end_time = time()
print(
f"3 pre | duplicating repo... done in {end_time - start_time:.2f}",
f"3 pre | duplicating repo... done in {generate_time_elapsed_string(end_time - start_time)}",
flush=True,
)
@ -548,7 +583,7 @@ def main() -> None:
r["remote/github"] = "github"
step(
desc=f"X fin | pushing to github/{branch}",
desc=f"X fin | pushing to {r['remote/github']}/{branch}",
func=cmd(
f"git push {r['remote/github']} {branch} --force"
if ("--test" not in argv)
@ -557,14 +592,9 @@ def main() -> None:
)
cumulative_end_time = time()
time_taken = cumulative_end_time - cumulative_start_time
time_taken_string: str
if time_taken > 60:
time_taken_string = f"{int(time_taken // 60)}{int(time_taken % 60)}"
else:
time_taken_string = f"{time_taken:.2f}"
print(
f"\n--- done! took {time_taken_string}~ " "☆*: .。. o(≧▽≦)o .。.:*☆ ---",
f"\n--- done! took {generate_time_elapsed_string(cumulative_end_time - cumulative_start_time)}~ "
"☆*: .。. o(≧▽≦)o .。.:*☆ ---",
flush=True,
)