tooling: faster repo dupe + std elapsed string gen
rewrite a few things on sidestepper so that we can get back the sim and ignored directories found by the large file finding algorithm (LargeFileFilterResult) from 23.5″ to 4.6″ (58″ overall to ~35″ overall) -- approx 5x faster rn
This commit is contained in:
parent
b3ea2625d5
commit
e4639b03df
2 changed files with 154 additions and 63 deletions
151
sidestepper.py
151
sidestepper.py
|
@ -205,6 +205,24 @@ def one_sided(a: A, bbb: Iterable[B]) -> Iterator[OneSided[A, B]]:
|
||||||
yield OneSided(a, b)
|
yield OneSided(a, b)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_time_elapsed_string(time_taken: float) -> str:
    """
    generates a human-readable time-elapsed string from a time taken float

    args:
        time_taken: float
            elapsed time in seconds

    returns: str
        "{H}h {M}′ {S}″" for an hour or more,
        "{M}′ {S}″" for a minute or more,
        "{S.SS}″" (two decimal places) otherwise
    """
    hours = int(time_taken // 3600)
    minutes = int(time_taken % 3600 // 60)
    seconds = int(time_taken % 60)

    time_taken_string: str

    # use >= on both thresholds: with the strict `>` comparison, exactly
    # 3600.0 fell into the minutes branch and rendered as "0′ 0″"
    # (silently dropping the whole hour), and exactly 60.0 rendered as
    # "60.00″" instead of "1′ 0″"
    if time_taken >= 3600:
        time_taken_string = f"{hours}h {minutes}′ {seconds}″"
    elif time_taken >= 60:
        time_taken_string = f"{minutes}′ {seconds}″"
    else:
        time_taken_string = f"{time_taken:.2f}″"

    return time_taken_string
|
||||||
|
|
||||||
|
|
||||||
@dataclass(eq=True, frozen=True)
|
@dataclass(eq=True, frozen=True)
|
||||||
class SideStepIgnoreMatcher:
|
class SideStepIgnoreMatcher:
|
||||||
"""immutable gitignore matcher"""
|
"""immutable gitignore matcher"""
|
||||||
|
@ -234,7 +252,7 @@ class SideStepIgnoreMatcher:
|
||||||
root=self.root, rules=self.rules + ((gitignore.parent, tuple(new_ruleset)),)
|
root=self.root, rules=self.rules + ((gitignore.parent, tuple(new_ruleset)),)
|
||||||
)
|
)
|
||||||
|
|
||||||
def match(self, file: Path) -> bool:
|
def match(self, file: Path | str) -> bool:
|
||||||
"""returns True if the file is ignored by any of the rules in the gitignore files, False otherwise"""
|
"""returns True if the file is ignored by any of the rules in the gitignore files, False otherwise"""
|
||||||
matched = False
|
matched = False
|
||||||
|
|
||||||
|
@ -272,6 +290,24 @@ class SideStepIgnoreMatcher:
|
||||||
return any(rule.negation for rule in ruleset)
|
return any(rule.negation for rule in ruleset)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(eq=True, frozen=True)
class LargeFileFilterResult:
    """
    result data structure of the large file filter

    files: tuple[Path, ...]
        large files found
    matcher: SideStepIgnoreMatcher
        the *ignore matcher instance
    ignore_directories: tuple[Path, ...]
        directories that were ignored
    """

    # large files found by the filter
    files: tuple[Path, ...]
    # the *ignore matcher instance built while scanning
    matcher: SideStepIgnoreMatcher
    # directories the filter decided to ignore
    ignore_directories: tuple[Path, ...]
|
||||||
|
|
||||||
|
|
||||||
def _parallel() -> bool:
|
def _parallel() -> bool:
|
||||||
"""
|
"""
|
||||||
helper function to determine if we should use multiprocessing;
|
helper function to determine if we should use multiprocessing;
|
||||||
|
@ -311,7 +347,7 @@ def _iter_files(
|
||||||
yield target_file
|
yield target_file
|
||||||
|
|
||||||
|
|
||||||
def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
|
def iter_files(target_dir: Path) -> tuple[tuple[Path, ...], SideStepIgnoreMatcher]:
|
||||||
"""
|
"""
|
||||||
get all non-git files and register .gitignore files
|
get all non-git files and register .gitignore files
|
||||||
|
|
||||||
|
@ -319,8 +355,8 @@ def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
|
||||||
target_dir: Path
|
target_dir: Path
|
||||||
the directory to search in
|
the directory to search in
|
||||||
|
|
||||||
returns: tuple[list[Path], SideStepIgnoreMatcher]
|
returns: tuple[tuple[Path, ...], SideStepIgnoreMatcher]
|
||||||
list of all files in the target directory and a SideStepIgnoreMatcher instance
|
tuple of all files in the target directory and a SideStepIgnoreMatcher instance
|
||||||
"""
|
"""
|
||||||
|
|
||||||
all_files: list[Path] = []
|
all_files: list[Path] = []
|
||||||
|
@ -335,7 +371,7 @@ def iter_files(target_dir: Path) -> tuple[list[Path], SideStepIgnoreMatcher]:
|
||||||
if file.name == ".gitignore":
|
if file.name == ".gitignore":
|
||||||
sim = sim.add_gitignore(file)
|
sim = sim.add_gitignore(file)
|
||||||
|
|
||||||
return all_files, sim
|
return tuple(all_files), sim
|
||||||
|
|
||||||
|
|
||||||
def _filter_sim_match(
|
def _filter_sim_match(
|
||||||
|
@ -372,9 +408,10 @@ def _filter_ign_dirs_and_size(os: OneSided[list[Path], Path]) -> Path | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _find_large_files_single(target: Path) -> list[Path]:
|
def _find_large_files_single(
|
||||||
|
files: tuple[Path, ...], sim: SideStepIgnoreMatcher
|
||||||
|
) -> LargeFileFilterResult:
|
||||||
"""single-process implementation of find_large_files"""
|
"""single-process implementation of find_large_files"""
|
||||||
files, sim = iter_files(target)
|
|
||||||
ignore_dirs: list[Path] = []
|
ignore_dirs: list[Path] = []
|
||||||
|
|
||||||
_files = []
|
_files = []
|
||||||
|
@ -394,15 +431,21 @@ def _find_large_files_single(target: Path) -> list[Path]:
|
||||||
leave=False,
|
leave=False,
|
||||||
total=len(_files),
|
total=len(_files),
|
||||||
):
|
):
|
||||||
if f := _filter_ign_dirs_and_size(fds_os):
|
f = _filter_ign_dirs_and_size(fds_os)
|
||||||
|
if f is not None:
|
||||||
large_files.append(f)
|
large_files.append(f)
|
||||||
|
|
||||||
return large_files
|
return LargeFileFilterResult(
|
||||||
|
files=tuple(large_files),
|
||||||
|
matcher=sim,
|
||||||
|
ignore_directories=tuple(ignore_dirs),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _find_large_files_parallel(target: Path) -> list[Path]:
|
def _find_large_files_parallel(
|
||||||
|
files: tuple[Path, ...], sim: SideStepIgnoreMatcher
|
||||||
|
) -> LargeFileFilterResult:
|
||||||
"""multiprocess implementation of find_large_files"""
|
"""multiprocess implementation of find_large_files"""
|
||||||
files, sim = iter_files(target)
|
|
||||||
manager = Manager()
|
manager = Manager()
|
||||||
ignore_dirs: ListProxy[Path] = manager.list()
|
ignore_dirs: ListProxy[Path] = manager.list()
|
||||||
|
|
||||||
|
@ -420,40 +463,51 @@ def _find_large_files_parallel(target: Path) -> list[Path]:
|
||||||
if f is not None
|
if f is not None
|
||||||
]
|
]
|
||||||
|
|
||||||
return [
|
large_files: tuple[Path, ...] = tuple(
|
||||||
f
|
[
|
||||||
for f in process_map(
|
f
|
||||||
_filter_ign_dirs_and_size,
|
for f in process_map(
|
||||||
one_sided(a=ignore_dirs, bbb=_files),
|
_filter_ign_dirs_and_size,
|
||||||
desc="1 pre | finding large files - dir rematching (3/3)",
|
one_sided(a=ignore_dirs, bbb=_files),
|
||||||
leave=False,
|
desc="1 pre | finding large files - dir rematching (3/3)",
|
||||||
chunksize=SOTA_SIDESTEP_CHUNK_SIZE,
|
leave=False,
|
||||||
max_workers=SOTA_SIDESTEP_MAX_WORKERS,
|
chunksize=SOTA_SIDESTEP_CHUNK_SIZE,
|
||||||
total=len(files),
|
max_workers=SOTA_SIDESTEP_MAX_WORKERS,
|
||||||
)
|
total=len(files),
|
||||||
if f is not None
|
)
|
||||||
]
|
if f is not None
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
return LargeFileFilterResult(
|
||||||
|
files=large_files,
|
||||||
|
matcher=sim,
|
||||||
|
ignore_directories=tuple(ignore_dirs),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def find_large_files(
    files: tuple[Path, ...], matcher: SideStepIgnoreMatcher
) -> LargeFileFilterResult:
    """
    finds all files larger than a certain size in a directory;
    uses SOTA_SIDESTEP_LARGE_FILE_SIZE as the size threshold

    args:
        files: tuple[Path, ...]
            list of files to search through
        matcher: SideStepIgnoreMatcher
            the ignore matcher instance from iter_files()

    returns: LargeFileFilterResult
    """
    # pick the multiprocess implementation only when _parallel() says it
    # is worthwhile; both implementations share the same signature
    implementation = (
        _find_large_files_parallel if _parallel() else _find_large_files_single
    )
    return implementation(files, matcher)
|
||||||
|
|
||||||
|
|
||||||
def write_sotaignore(large_files: list[Path]) -> bool:
|
def write_sotaignore(large_files: tuple[Path, ...]) -> bool:
|
||||||
"""
|
"""
|
||||||
writes out a .sotaignore file with a list of large files,
|
writes out a .sotaignore file with a list of large files,
|
||||||
updating an existing one if already present
|
updating an existing one if already present
|
||||||
|
@ -514,23 +568,35 @@ def main() -> None:
|
||||||
|
|
||||||
cumulative_start_time = time()
|
cumulative_start_time = time()
|
||||||
|
|
||||||
print(f"1/2{INDENT}finding large files... ", end="", file=stderr)
|
print(f"1/3{INDENT}pre-scanning repository... ", end="", file=stderr)
|
||||||
start_time = time()
|
start_time = time()
|
||||||
large_files = find_large_files(REPO_DIR)
|
files, sim = iter_files(REPO_DIR)
|
||||||
end_time = time()
|
end_time = time()
|
||||||
print(
|
print(
|
||||||
f"1/2{INDENT}finding large files... "
|
f"1/3{INDENT}pre-scanning repository... "
|
||||||
f"done in {end_time - start_time:.2f}″ "
|
f"done in {generate_time_elapsed_string(end_time - start_time)} "
|
||||||
|
f"(found {len(files)})",
|
||||||
|
file=stderr,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"2/3{INDENT}finding large files... ", end="", file=stderr)
|
||||||
|
start_time = time()
|
||||||
|
large_files = find_large_files(files, sim).files
|
||||||
|
end_time = time()
|
||||||
|
print(
|
||||||
|
f"2/3{INDENT}finding large files... "
|
||||||
|
f"done in {generate_time_elapsed_string(end_time - start_time)} "
|
||||||
f"(found {len(large_files)})",
|
f"(found {len(large_files)})",
|
||||||
file=stderr,
|
file=stderr,
|
||||||
)
|
)
|
||||||
|
|
||||||
print(f"2/2{INDENT}writing .sotaignore file... ", end="", file=stderr)
|
print(f"3/3{INDENT}writing .sotaignore file... ", end="", file=stderr)
|
||||||
start_time = time()
|
start_time = time()
|
||||||
was_written = write_sotaignore(large_files)
|
was_written = write_sotaignore(large_files)
|
||||||
end_time = time()
|
end_time = time()
|
||||||
print(
|
print(
|
||||||
("done" if was_written else "skipped") + f" in {end_time - start_time:.2f}″\n",
|
("done" if was_written else "skipped")
|
||||||
|
+ f" in {generate_time_elapsed_string(end_time - start_time)}\n",
|
||||||
file=stderr,
|
file=stderr,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -538,14 +604,9 @@ def main() -> None:
|
||||||
print(file.relative_to(REPO_DIR))
|
print(file.relative_to(REPO_DIR))
|
||||||
|
|
||||||
cumulative_end_time = time()
|
cumulative_end_time = time()
|
||||||
time_taken = cumulative_end_time - cumulative_start_time
|
|
||||||
time_taken_string: str
|
|
||||||
if time_taken > 60:
|
|
||||||
time_taken_string = f"{int(time_taken // 60)}′{int(time_taken % 60)}″"
|
|
||||||
else:
|
|
||||||
time_taken_string = f"{time_taken:.2f}″"
|
|
||||||
print(
|
print(
|
||||||
f"\n--- done! took {time_taken_string}~ " "☆*: .。. o(≧▽≦)o .。.:*☆ ---",
|
f"\n--- done! took {generate_time_elapsed_string(cumulative_end_time - cumulative_start_time)}~ "
|
||||||
|
"☆*: .。. o(≧▽≦)o .。.:*☆ ---",
|
||||||
flush=True,
|
flush=True,
|
||||||
file=stderr,
|
file=stderr,
|
||||||
)
|
)
|
||||||
|
|
66
sync.py
66
sync.py
|
@ -17,8 +17,11 @@ from typing import Callable, Final, TypeVar
|
||||||
try:
|
try:
|
||||||
from sidestepper import (
|
from sidestepper import (
|
||||||
SOTA_SIDESTEP_MAX_WORKERS,
|
SOTA_SIDESTEP_MAX_WORKERS,
|
||||||
|
LargeFileFilterResult,
|
||||||
find_large_files,
|
find_large_files,
|
||||||
generate_command_failure_message,
|
generate_command_failure_message,
|
||||||
|
generate_time_elapsed_string,
|
||||||
|
iter_files,
|
||||||
run,
|
run,
|
||||||
write_sotaignore,
|
write_sotaignore,
|
||||||
)
|
)
|
||||||
|
@ -54,7 +57,7 @@ class CopyHighway:
|
||||||
for use with shutil.copytree(); also displays a progress bar
|
for use with shutil.copytree(); also displays a progress bar
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, message: str, total: int):
|
def __init__(self, message: str, total: int, lff_result: LargeFileFilterResult):
|
||||||
"""
|
"""
|
||||||
multithreaded file copying class that gives a copy2-like function
|
multithreaded file copying class that gives a copy2-like function
|
||||||
for use with shutil.copytree()
|
for use with shutil.copytree()
|
||||||
|
@ -64,6 +67,8 @@ class CopyHighway:
|
||||||
message to display in the progress bar
|
message to display in the progress bar
|
||||||
total: int
|
total: int
|
||||||
total number of files to copy
|
total number of files to copy
|
||||||
|
lff_result: LargeFileFilterResult
|
||||||
|
result of the large file filter
|
||||||
"""
|
"""
|
||||||
self.pool = ThreadPool(
|
self.pool = ThreadPool(
|
||||||
processes=SOTA_SIDESTEP_MAX_WORKERS,
|
processes=SOTA_SIDESTEP_MAX_WORKERS,
|
||||||
|
@ -74,13 +79,27 @@ class CopyHighway:
|
||||||
unit=" files",
|
unit=" files",
|
||||||
leave=False,
|
leave=False,
|
||||||
)
|
)
|
||||||
|
self.lff_result = lff_result
|
||||||
|
|
||||||
def callback(self, a: R):
|
def callback(self, a: R):
|
||||||
self.pbar.update()
|
self.pbar.update()
|
||||||
return a
|
return a
|
||||||
|
|
||||||
def copy2(self, source: str, dest: str):
|
def copy2(self, source: Path | str, dest: Path | str) -> None:
|
||||||
"""shutil.copy2()-like function for use with shutil.copytree()"""
|
"""shutil.copy2()-like function for use with shutil.copytree()"""
|
||||||
|
|
||||||
|
# ignore check 1: dir
|
||||||
|
for ign_dir in self.lff_result.ignore_directories:
|
||||||
|
if str(ign_dir) in str(source):
|
||||||
|
return None
|
||||||
|
|
||||||
|
# ignore check 2: file
|
||||||
|
# ... we don't need to use the trytrytry method
|
||||||
|
# ... because we already did that as part of the large file filter,
|
||||||
|
# ... and as such we checked for it with the first check above
|
||||||
|
if self.lff_result.matcher.match(source):
|
||||||
|
return None
|
||||||
|
|
||||||
self.pool.apply_async(copy2, args=(source, dest), callback=self.callback)
|
self.pool.apply_async(copy2, args=(source, dest), callback=self.callback)
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
|
@ -286,7 +305,10 @@ def step(
|
||||||
|
|
||||||
# yay
|
# yay
|
||||||
if desc != "" and post_print:
|
if desc != "" and post_print:
|
||||||
print(f" done in {end_time - start_time:.2f}″", flush=True)
|
print(
|
||||||
|
f" done in {generate_time_elapsed_string(end_time - start_time)}",
|
||||||
|
flush=True,
|
||||||
|
)
|
||||||
|
|
||||||
return rp
|
return rp
|
||||||
|
|
||||||
|
@ -402,14 +424,17 @@ def main() -> None:
|
||||||
"critical error: repository is not clean, please commit changes first",
|
"critical error: repository is not clean, please commit changes first",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
start_time = time()
|
||||||
|
print("1 pre | finding large files", end="", flush=True)
|
||||||
|
files, sim = iter_files(REPO_DIR)
|
||||||
|
|
||||||
if "--skipsotaignoregen" not in argv:
|
if "--skipsotaignoregen" not in argv:
|
||||||
(print("1 pre | finding large files", end="", flush=True),)
|
flf_filter_result = find_large_files(files, sim)
|
||||||
start_time = time()
|
large_files = flf_filter_result.files
|
||||||
large_files = find_large_files(REPO_DIR)
|
|
||||||
end_time = time()
|
end_time = time()
|
||||||
print(
|
print(
|
||||||
"1 pre | finding large files... "
|
"1 pre | finding large files... "
|
||||||
f"done in {end_time - start_time:.2f}″ (found {len(large_files)})"
|
f"done in {generate_time_elapsed_string(end_time - start_time)} (found {len(large_files)})"
|
||||||
)
|
)
|
||||||
|
|
||||||
if large_files:
|
if large_files:
|
||||||
|
@ -422,15 +447,25 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
end_time = time()
|
end_time = time()
|
||||||
if was_written:
|
if was_written:
|
||||||
print(f" done in {end_time - start_time:.2f}″")
|
print(
|
||||||
|
f" done in {generate_time_elapsed_string(end_time - start_time)}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
print(" not needed")
|
print(" not needed")
|
||||||
|
else:
|
||||||
|
end_time = time()
|
||||||
|
print(
|
||||||
|
"1 pre | finding large files... "
|
||||||
|
f"skipped in {generate_time_elapsed_string(end_time - start_time)}"
|
||||||
|
)
|
||||||
|
|
||||||
print("3 pre | duplicating repo... pre-scanning", end="", flush=True)
|
print("3 pre | duplicating repo... pre-scanning", end="", flush=True)
|
||||||
|
|
||||||
start_time = time()
|
start_time = time()
|
||||||
with CopyHighway(
|
with CopyHighway(
|
||||||
"3 pre | duplicating repo", total=len(list(REPO_DIR.rglob("*")))
|
message="3 pre | duplicating repo",
|
||||||
|
total=len(list(REPO_DIR.rglob("*"))),
|
||||||
|
lff_result=flf_filter_result,
|
||||||
) as copier:
|
) as copier:
|
||||||
copytree(
|
copytree(
|
||||||
src=REPO_DIR,
|
src=REPO_DIR,
|
||||||
|
@ -440,7 +475,7 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
end_time = time()
|
end_time = time()
|
||||||
print(
|
print(
|
||||||
f"3 pre | duplicating repo... done in {end_time - start_time:.2f}″",
|
f"3 pre | duplicating repo... done in {generate_time_elapsed_string(end_time - start_time)}",
|
||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -548,7 +583,7 @@ def main() -> None:
|
||||||
r["remote/github"] = "github"
|
r["remote/github"] = "github"
|
||||||
|
|
||||||
step(
|
step(
|
||||||
desc=f"X fin | pushing to github/{branch}",
|
desc=f"X fin | pushing to {r['remote/github']}/{branch}",
|
||||||
func=cmd(
|
func=cmd(
|
||||||
f"git push {r['remote/github']} {branch} --force"
|
f"git push {r['remote/github']} {branch} --force"
|
||||||
if ("--test" not in argv)
|
if ("--test" not in argv)
|
||||||
|
@ -557,14 +592,9 @@ def main() -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
cumulative_end_time = time()
|
cumulative_end_time = time()
|
||||||
time_taken = cumulative_end_time - cumulative_start_time
|
|
||||||
time_taken_string: str
|
|
||||||
if time_taken > 60:
|
|
||||||
time_taken_string = f"{int(time_taken // 60)}′{int(time_taken % 60)}″"
|
|
||||||
else:
|
|
||||||
time_taken_string = f"{time_taken:.2f}″"
|
|
||||||
print(
|
print(
|
||||||
f"\n--- done! took {time_taken_string}~ " "☆*: .。. o(≧▽≦)o .。.:*☆ ---",
|
f"\n--- done! took {generate_time_elapsed_string(cumulative_end_time - cumulative_start_time)}~ "
|
||||||
|
"☆*: .。. o(≧▽≦)o .。.:*☆ ---",
|
||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
Reference in a new issue