tooling: faster repo dupe + std elapsed string gen
rewrite a few things on sidestepper so that we can get back the sim and ignored directories found by the large file finding algorithm (LargeFileFilterResult) from 23.5″ to 4.6″ (58″ overall to ~35″ overall) -- approx 5x faster rn
This commit is contained in:
parent
b3ea2625d5
commit
e4639b03df
151
sidestepper.py
151
sidestepper.py
|
@ -205,6 +205,24 @@ def one_sided(a: A, bbb: Iterable[B]) -> Iterator[OneSided[A, B]]:
|
|||
yield OneSided(a, b)
|
||||
|
||||
|
||||
def generate_time_elapsed_string(time_taken: float) -> str:
|
||||
"""generates a human-readable time-elapsed string from a time taken float"""
|
||||
hours = int(time_taken // 3600)
|
||||
minutes = int(time_taken % 3600 // 60)
|
||||
seconds = int(time_taken % 60)
|
||||
|
||||
time_taken_string: str
|
||||
|
||||
if time_taken > 3600:
|
||||
time_taken_string = f"{hours}h {minutes}′ {seconds}″"
|
||||
elif time_taken > 60:
|
||||
time_taken_string = f"{minutes}′ {seconds}″"
|
||||
else:
|
||||
time_taken_string = f"{time_taken:.2f}″"
|
||||
|
||||
return time_taken_string
|
||||
|
||||
|
||||
@dataclass(eq=True, frozen=True)
|
||||
class SideStepIgnoreMatcher:
|
||||
"""immutable gitignore matcher"""
|
||||
|
@ -234,7 +252,7 @@ class SideStepIgnoreMatcher:
|
|||
root=self.root, rules=self.rules + ((gitignore.parent, tuple(new_ruleset)),)
|
||||
)
|
||||
|
||||
def match(self, file: Path) -> bool:
|
||||
def match(self, file: Path | str) -> bool:
|
||||
"""returns True if the file is ignored by any of the rules in the gitignore files, False otherwise"""
|
||||
matched = False
|
||||
|
||||
|
@ -272,6 +290,24 @@ class SideStepIgnoreMatcher:
|
|||
return any(rule.negation for rule in ruleset)
|
||||
|
||||
|
||||
@dataclass(eq=True, frozen=True)
|
||||
class LargeFileFilterResult:
|
||||
"""
|
||||
result data structure of the large file filter
|
||||
|
||||
files: tuple[Path, ...]
|
||||
large files found
|
||||
matcher: SideStepIgnoreMatcher
|
||||
the *ignore matcher instance
|
||||
ignore_directories: tuple[Path, ...]
|
||||
directories that were ignored
|
||||
"""
|
||||
|
||||
files: tuple[Path, ...]
|
||||
matcher: SideStepIgnoreMatcher
|
||||
ignore_directories: tuple[Path, ...]
|
||||
|
||||
|
||||
def _parallel() -> bool:
|
||||
"""
|
||||
helper function to determine if we should use multiprocessing;
|
||||
|
@ -311,7 +347,7 @@ def _iter_files(
|
|||
yield target_file
|
||||
|
||||
|
||||
def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
|
||||
def iter_files(target_dir: Path) -> tuple[tuple[Path, ...], SideStepIgnoreMatcher]:
|
||||
"""
|
||||
get all non-git files and register .gitignore files
|
||||
|
||||
|
@ -319,8 +355,8 @@ def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
|
|||
target_dir: Path
|
||||
the directory to search in
|
||||
|
||||
returns: tuple[list[Path], SideStepIgnoreMatcher]
|
||||
list of all files in the target directory and a SideStepIgnoreMatcher instance
|
||||
returns: tuple[tuple[Path, ...], SideStepIgnoreMatcher]
|
||||
tuple of all files in the target directory and a SideStepIgnoreMatcher instance
|
||||
"""
|
||||
|
||||
all_files: list[Path] = []
|
||||
|
@ -335,7 +371,7 @@ def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
|
|||
if file.name == ".gitignore":
|
||||
sim = sim.add_gitignore(file)
|
||||
|
||||
return all_files, sim
|
||||
return tuple(all_files), sim
|
||||
|
||||
|
||||
def _filter_sim_match(
|
||||
|
@ -372,9 +408,10 @@ def _filter_ign_dirs_and_size(os: OneSided[list[Path], Path]) -> Path | None:
|
|||
return None
|
||||
|
||||
|
||||
def _find_large_files_single(target: Path) -> list[Path]:
|
||||
def _find_large_files_single(
|
||||
files: tuple[Path, ...], sim: SideStepIgnoreMatcher
|
||||
) -> LargeFileFilterResult:
|
||||
"""single-process implementation of find_large_files"""
|
||||
files, sim = iter_files(target)
|
||||
ignore_dirs: list[Path] = []
|
||||
|
||||
_files = []
|
||||
|
@ -394,15 +431,21 @@ def _find_large_files_single(target: Path) -> list[Path]:
|
|||
leave=False,
|
||||
total=len(_files),
|
||||
):
|
||||
if f := _filter_ign_dirs_and_size(fds_os):
|
||||
f = _filter_ign_dirs_and_size(fds_os)
|
||||
if f is not None:
|
||||
large_files.append(f)
|
||||
|
||||
return large_files
|
||||
return LargeFileFilterResult(
|
||||
files=tuple(large_files),
|
||||
matcher=sim,
|
||||
ignore_directories=tuple(ignore_dirs),
|
||||
)
|
||||
|
||||
|
||||
def _find_large_files_parallel(target: Path) -> list[Path]:
|
||||
def _find_large_files_parallel(
|
||||
files: tuple[Path, ...], sim: SideStepIgnoreMatcher
|
||||
) -> LargeFileFilterResult:
|
||||
"""multiprocess implementation of find_large_files"""
|
||||
files, sim = iter_files(target)
|
||||
manager = Manager()
|
||||
ignore_dirs: ListProxy[Path] = manager.list()
|
||||
|
||||
|
@ -420,40 +463,51 @@ def _find_large_files_parallel(target: Path) -> list[Path]:
|
|||
if f is not None
|
||||
]
|
||||
|
||||
return [
|
||||
f
|
||||
for f in process_map(
|
||||
_filter_ign_dirs_and_size,
|
||||
one_sided(a=ignore_dirs, bbb=_files),
|
||||
desc="1 pre | finding large files - dir rematching (3/3)",
|
||||
leave=False,
|
||||
chunksize=SOTA_SIDESTEP_CHUNK_SIZE,
|
||||
max_workers=SOTA_SIDESTEP_MAX_WORKERS,
|
||||
total=len(files),
|
||||
)
|
||||
if f is not None
|
||||
]
|
||||
large_files: tuple[Path, ...] = tuple(
|
||||
[
|
||||
f
|
||||
for f in process_map(
|
||||
_filter_ign_dirs_and_size,
|
||||
one_sided(a=ignore_dirs, bbb=_files),
|
||||
desc="1 pre | finding large files - dir rematching (3/3)",
|
||||
leave=False,
|
||||
chunksize=SOTA_SIDESTEP_CHUNK_SIZE,
|
||||
max_workers=SOTA_SIDESTEP_MAX_WORKERS,
|
||||
total=len(files),
|
||||
)
|
||||
if f is not None
|
||||
]
|
||||
)
|
||||
|
||||
return LargeFileFilterResult(
|
||||
files=large_files,
|
||||
matcher=sim,
|
||||
ignore_directories=tuple(ignore_dirs),
|
||||
)
|
||||
|
||||
|
||||
def find_large_files(target: Path) -> list[Path]:
|
||||
def find_large_files(
|
||||
files: tuple[Path, ...], matcher: SideStepIgnoreMatcher
|
||||
) -> LargeFileFilterResult:
|
||||
"""
|
||||
finds all files larger than a certain size in a directory;
|
||||
uses SOTA_SIDESTEP_LARGE_FILE_SIZE as the size threshold
|
||||
|
||||
args:
|
||||
target_dir: Path
|
||||
the directory to search in
|
||||
files: tuple[Path, ...]
|
||||
list of files to search through
|
||||
matcher: SideStepIgnoreMatcher
|
||||
the ignore matcher instance from iter_files()
|
||||
|
||||
returns: list[Path]
|
||||
list of large files
|
||||
returns: LargeFileFilterResult
|
||||
"""
|
||||
if _parallel():
|
||||
return _find_large_files_parallel(target)
|
||||
return _find_large_files_parallel(files, matcher)
|
||||
else:
|
||||
return _find_large_files_single(target)
|
||||
return _find_large_files_single(files, matcher)
|
||||
|
||||
|
||||
def write_sotaignore(large_files: list[Path]) -> bool:
|
||||
def write_sotaignore(large_files: tuple[Path, ...]) -> bool:
|
||||
"""
|
||||
writes out a .sotaignore file with a list of large files,
|
||||
updating an existing one if already present
|
||||
|
@ -514,23 +568,35 @@ def main() -> None:
|
|||
|
||||
cumulative_start_time = time()
|
||||
|
||||
print(f"1/2{INDENT}finding large files... ", end="", file=stderr)
|
||||
print(f"1/3{INDENT}pre-scanning repository... ", end="", file=stderr)
|
||||
start_time = time()
|
||||
large_files = find_large_files(REPO_DIR)
|
||||
files, sim = iter_files(REPO_DIR)
|
||||
end_time = time()
|
||||
print(
|
||||
f"1/2{INDENT}finding large files... "
|
||||
f"done in {end_time - start_time:.2f}″ "
|
||||
f"1/3{INDENT}pre-scanning repository... "
|
||||
f"done in {generate_time_elapsed_string(end_time - start_time)} "
|
||||
f"(found {len(files)})",
|
||||
file=stderr,
|
||||
)
|
||||
|
||||
print(f"2/3{INDENT}finding large files... ", end="", file=stderr)
|
||||
start_time = time()
|
||||
large_files = find_large_files(files, sim).files
|
||||
end_time = time()
|
||||
print(
|
||||
f"2/3{INDENT}finding large files... "
|
||||
f"done in {generate_time_elapsed_string(end_time - start_time)} "
|
||||
f"(found {len(large_files)})",
|
||||
file=stderr,
|
||||
)
|
||||
|
||||
print(f"2/2{INDENT}writing .sotaignore file... ", end="", file=stderr)
|
||||
print(f"3/3{INDENT}writing .sotaignore file... ", end="", file=stderr)
|
||||
start_time = time()
|
||||
was_written = write_sotaignore(large_files)
|
||||
end_time = time()
|
||||
print(
|
||||
("done" if was_written else "skipped") + f" in {end_time - start_time:.2f}″\n",
|
||||
("done" if was_written else "skipped")
|
||||
+ f" in {generate_time_elapsed_string(end_time - start_time)}\n",
|
||||
file=stderr,
|
||||
)
|
||||
|
||||
|
@ -538,14 +604,9 @@ def main() -> None:
|
|||
print(file.relative_to(REPO_DIR))
|
||||
|
||||
cumulative_end_time = time()
|
||||
time_taken = cumulative_end_time - cumulative_start_time
|
||||
time_taken_string: str
|
||||
if time_taken > 60:
|
||||
time_taken_string = f"{int(time_taken // 60)}′{int(time_taken % 60)}″"
|
||||
else:
|
||||
time_taken_string = f"{time_taken:.2f}″"
|
||||
print(
|
||||
f"\n--- done! took {time_taken_string}~ " "☆*: .。. o(≧▽≦)o .。.:*☆ ---",
|
||||
f"\n--- done! took {generate_time_elapsed_string(cumulative_end_time - cumulative_start_time)}~ "
|
||||
"☆*: .。. o(≧▽≦)o .。.:*☆ ---",
|
||||
flush=True,
|
||||
file=stderr,
|
||||
)
|
||||
|
|
66
sync.py
66
sync.py
|
@ -17,8 +17,11 @@ from typing import Callable, Final, TypeVar
|
|||
try:
|
||||
from sidestepper import (
|
||||
SOTA_SIDESTEP_MAX_WORKERS,
|
||||
LargeFileFilterResult,
|
||||
find_large_files,
|
||||
generate_command_failure_message,
|
||||
generate_time_elapsed_string,
|
||||
iter_files,
|
||||
run,
|
||||
write_sotaignore,
|
||||
)
|
||||
|
@ -54,7 +57,7 @@ class CopyHighway:
|
|||
for use with shutil.copytree(); also displays a progress bar
|
||||
"""
|
||||
|
||||
def __init__(self, message: str, total: int):
|
||||
def __init__(self, message: str, total: int, lff_result: LargeFileFilterResult):
|
||||
"""
|
||||
multithreaded file copying class that gives a copy2-like function
|
||||
for use with shutil.copytree()
|
||||
|
@ -64,6 +67,8 @@ class CopyHighway:
|
|||
message to display in the progress bar
|
||||
total: int
|
||||
total number of files to copy
|
||||
lff_result: LargeFileFilterResult
|
||||
result of the large file filter
|
||||
"""
|
||||
self.pool = ThreadPool(
|
||||
processes=SOTA_SIDESTEP_MAX_WORKERS,
|
||||
|
@ -74,13 +79,27 @@ class CopyHighway:
|
|||
unit=" files",
|
||||
leave=False,
|
||||
)
|
||||
self.lff_result = lff_result
|
||||
|
||||
def callback(self, a: R):
|
||||
self.pbar.update()
|
||||
return a
|
||||
|
||||
def copy2(self, source: str, dest: str):
|
||||
def copy2(self, source: Path | str, dest: Path | str) -> None:
|
||||
"""shutil.copy2()-like function for use with shutil.copytree()"""
|
||||
|
||||
# ignore check 1: dir
|
||||
for ign_dir in self.lff_result.ignore_directories:
|
||||
if str(ign_dir) in str(source):
|
||||
return None
|
||||
|
||||
# ignore check 2: file
|
||||
# ... we don't need to use the trytrytry method
|
||||
# ... because we already did that as part of the large file filter,
|
||||
# ... and as such we checked for it with the first check above
|
||||
if self.lff_result.matcher.match(source):
|
||||
return None
|
||||
|
||||
self.pool.apply_async(copy2, args=(source, dest), callback=self.callback)
|
||||
|
||||
def __enter__(self):
|
||||
|
@ -286,7 +305,10 @@ def step(
|
|||
|
||||
# yay
|
||||
if desc != "" and post_print:
|
||||
print(f" done in {end_time - start_time:.2f}″", flush=True)
|
||||
print(
|
||||
f" done in {generate_time_elapsed_string(end_time - start_time)}",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
return rp
|
||||
|
||||
|
@ -402,14 +424,17 @@ def main() -> None:
|
|||
"critical error: repository is not clean, please commit changes first",
|
||||
)
|
||||
|
||||
start_time = time()
|
||||
print("1 pre | finding large files", end="", flush=True)
|
||||
files, sim = iter_files(REPO_DIR)
|
||||
|
||||
if "--skipsotaignoregen" not in argv:
|
||||
(print("1 pre | finding large files", end="", flush=True),)
|
||||
start_time = time()
|
||||
large_files = find_large_files(REPO_DIR)
|
||||
flf_filter_result = find_large_files(files, sim)
|
||||
large_files = flf_filter_result.files
|
||||
end_time = time()
|
||||
print(
|
||||
"1 pre | finding large files... "
|
||||
f"done in {end_time - start_time:.2f}″ (found {len(large_files)})"
|
||||
f"done in {generate_time_elapsed_string(end_time - start_time)} (found {len(large_files)})"
|
||||
)
|
||||
|
||||
if large_files:
|
||||
|
@ -422,15 +447,25 @@ def main() -> None:
|
|||
)
|
||||
end_time = time()
|
||||
if was_written:
|
||||
print(f" done in {end_time - start_time:.2f}″")
|
||||
print(
|
||||
f" done in {generate_time_elapsed_string(end_time - start_time)}"
|
||||
)
|
||||
else:
|
||||
print(" not needed")
|
||||
else:
|
||||
end_time = time()
|
||||
print(
|
||||
"1 pre | finding large files... "
|
||||
f"skipped in {generate_time_elapsed_string(end_time - start_time)}"
|
||||
)
|
||||
|
||||
print("3 pre | duplicating repo... pre-scanning", end="", flush=True)
|
||||
|
||||
start_time = time()
|
||||
with CopyHighway(
|
||||
"3 pre | duplicating repo", total=len(list(REPO_DIR.rglob("*")))
|
||||
message="3 pre | duplicating repo",
|
||||
total=len(list(REPO_DIR.rglob("*"))),
|
||||
lff_result=flf_filter_result,
|
||||
) as copier:
|
||||
copytree(
|
||||
src=REPO_DIR,
|
||||
|
@ -440,7 +475,7 @@ def main() -> None:
|
|||
)
|
||||
end_time = time()
|
||||
print(
|
||||
f"3 pre | duplicating repo... done in {end_time - start_time:.2f}″",
|
||||
f"3 pre | duplicating repo... done in {generate_time_elapsed_string(end_time - start_time)}",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
|
@ -548,7 +583,7 @@ def main() -> None:
|
|||
r["remote/github"] = "github"
|
||||
|
||||
step(
|
||||
desc=f"X fin | pushing to github/{branch}",
|
||||
desc=f"X fin | pushing to {r['remote/github']}/{branch}",
|
||||
func=cmd(
|
||||
f"git push {r['remote/github']} {branch} --force"
|
||||
if ("--test" not in argv)
|
||||
|
@ -557,14 +592,9 @@ def main() -> None:
|
|||
)
|
||||
|
||||
cumulative_end_time = time()
|
||||
time_taken = cumulative_end_time - cumulative_start_time
|
||||
time_taken_string: str
|
||||
if time_taken > 60:
|
||||
time_taken_string = f"{int(time_taken // 60)}′{int(time_taken % 60)}″"
|
||||
else:
|
||||
time_taken_string = f"{time_taken:.2f}″"
|
||||
print(
|
||||
f"\n--- done! took {time_taken_string}~ " "☆*: .。. o(≧▽≦)o .。.:*☆ ---",
|
||||
f"\n--- done! took {generate_time_elapsed_string(cumulative_end_time - cumulative_start_time)}~ "
|
||||
"☆*: .。. o(≧▽≦)o .。.:*☆ ---",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
|
|
Reference in a new issue