This repository has been archived on 2024-08-16. You can view files and clone it, but cannot push or open issues or pull requests.
sota/sync.py
Mark Joshwel e4639b03df tooling: faster repo dupe + std elapsed string gen
rewrite a few things on sidestepper so that we can get back the sim
and ignored directories found by the large file finding algorithn (LargeFileFilterResult)

from 23.5″ to 4.6″ (58″ overall to ~35″ overall) -- approx 5x faster rn
2024-07-27 02:28:57 +08:00

604 lines
18 KiB
Python

# sota staircase ReStepper
# forge -> github one-way repo sync script
# licence: 0BSD
from multiprocessing.pool import ThreadPool
from pathlib import Path
from pprint import pformat
from shutil import copy2, copytree
from subprocess import CompletedProcess
from subprocess import run as _run
from sys import argv, executable
from tempfile import TemporaryDirectory
from textwrap import indent
from time import time
from traceback import format_tb
from typing import Callable, Final, TypeVar
try:
from sidestepper import (
SOTA_SIDESTEP_MAX_WORKERS,
LargeFileFilterResult,
find_large_files,
generate_command_failure_message,
generate_time_elapsed_string,
iter_files,
run,
write_sotaignore,
)
except EnvironmentError:
# specific error raised when third-party modules not found, but were automatically
# installed, so we need to restart the script
exit(_run([executable, Path(__file__).absolute(), *argv[1:]]).returncode)
# we can only guarantee third-party modules are installed after sidestepper
from tqdm import tqdm
# constants
INDENT: Final[str] = " "
REPO_DIR: Final[Path] = Path(__file__).parent
REPO_SOTAIGNORE: Final[Path] = REPO_DIR.joinpath(".sotaignore")
REPO_URL_GITHUB: Final[str] = "github.com/markjoshwel/sota"
REPO_URL_FORGE: Final[str] = "forge.joshwel.co/mark/sota"
COMMIT_MESSAGE: Final[str] = "chore(restep): sync with forge"
COMMIT_AUTHOR: Final[str] = "sota staircase ReStepper <ssrestepper@joshwel.co>"
NEUTERED_GITATTRIBUTES: Final[str] = (
"""# auto detect text files and perform lf normalization\n* text=auto\n"""
)
# dictionary to share state across steps
r: dict[str, str] = {}
R = TypeVar("R")
class CopyHighway:
"""
multithreaded file copying class that gives a copy2-like function
for use with shutil.copytree(); also displays a progress bar
"""
def __init__(self, message: str, total: int, lff_result: LargeFileFilterResult):
"""
multithreaded file copying class that gives a copy2-like function
for use with shutil.copytree()
args:
message: str
message to display in the progress bar
total: int
total number of files to copy
lff_result: LargeFileFilterResult
result of the large file filter
"""
self.pool = ThreadPool(
processes=SOTA_SIDESTEP_MAX_WORKERS,
)
self.pbar = tqdm(
total=total,
desc=message,
unit=" files",
leave=False,
)
self.lff_result = lff_result
def callback(self, a: R):
self.pbar.update()
return a
def copy2(self, source: Path | str, dest: Path | str) -> None:
"""shutil.copy2()-like function for use with shutil.copytree()"""
# ignore check 1: dir
for ign_dir in self.lff_result.ignore_directories:
if str(ign_dir) in str(source):
return None
# ignore check 2: file
# ... we don't need to use the trytrytry method
# ... because we already did that as part of the large file filter,
# ... and as such we checked for it with the first check above
if self.lff_result.matcher.match(source):
return None
self.pool.apply_async(copy2, args=(source, dest), callback=self.callback)
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.pool.close()
self.pool.join()
self.pbar.close()
def _default_post_func(cp: R) -> R:
"""
default post-call function for steps; does nothing
for steps that return a CompletedProcess, this function will run the
`_command_post_func` function
args:
cp: R
return object from a step function
returns: R
the return object from the step function
"""
if isinstance(cp, CompletedProcess):
_command_post_func(cp)
return cp
def _command_post_func(
cp: CompletedProcess,
fail_on_error: bool = True,
quit_early: bool = False,
quit_message: str = "the command gave unexpected output",
) -> CompletedProcess:
"""
default post-call function for command steps; checks if the command was
successful and prints the output if it wasn't
if the command was successful, the stdout and stderr are stored in the
shared state dictionary r under 'stdout' and 'stderr' respectively
args:
cp: CompletedProcess
return object from subprocess.run()
fail_on_error: bool
whether to fail on error
quit_early: bool
whether to quit early
quit_message: str
the message to print if quitting early
returns: CompletedProcess
the return object from subprocess.run()
"""
if quit_early:
print(f"\n\nfailure: {quit_message}\n")
else:
r["stdout"] = cp.stdout.decode() if isinstance(cp.stdout, bytes) else "\0"
r["stderr"] = cp.stderr.decode() if isinstance(cp.stderr, bytes) else "\0"
r["blank/stdout"] = "yes" if (r["stdout"].strip() == "") else ""
r["blank/stderr"] = "yes" if (r["stderr"].strip() == "") else ""
r["blank"] = "yes" if (r["blank/stdout"] and r["blank/stderr"]) else ""
r["errored"] = "" if (cp.returncode == 0) else str(cp.returncode)
# return if the command was successful
# or if we're not failing on error
if (cp.returncode == 0) or (not fail_on_error):
return cp
else:
print(generate_command_failure_message(cp))
exit(
cp.returncode if (isinstance(cp.returncode, int) and cp.returncode != 0) else 1
)
def post_filter_repo_check(cp: CompletedProcess) -> CompletedProcess:
"""
post-call function for checking if git-filter-repo is installed
and optionally installing it if it isn't
"""
if cp.returncode == 0:
return cp
if input("git filter-repo is not installed, install it? y/n: ").lower() != "y":
print(
"install it using 'pip install git-filter-repo' "
"or 'pipx install git-filter-repo'",
)
return cp
# check if pipx is installed
use_pipx = False
check_pipx_cp = run(["pipx", "--version"])
if check_pipx_cp.returncode == 0:
use_pipx = True
# install git-filter-repo
pip_invocation: list[str] = ["pipx"] if use_pipx else [executable, "-m", "pip"]
print(
f"running '{' '.join([*pip_invocation, "install", "git-filter-repo"])}'... ",
end="",
flush=True,
)
install_rc = run([*pip_invocation, "install", "git-filter-repo"])
if install_rc.returncode != 0:
print("error")
_command_post_func(install_rc)
exit(install_rc.returncode)
else:
print("done\n")
# check if it is reachable
if run(["git", "filter-repo", "--version"]).returncode != 0:
# revert
run([*pip_invocation, "uninstall", "git-filter-repo"])
print(
"failure: could not install git-filter-repo automatically. "
"do it yourself o(*≧▽≦)ツ┏━┓"
)
exit(-1)
return cp
def rewrite_gitattributes(target_dir: Path) -> None:
"""
rewrite the .gitattributes file in a directory to disable git-lfs
args:
target_dir: Path
the directory to search
"""
# recursively search for .gitattributes files
for repo_file in target_dir.rglob(".gitattributes"):
repo_file.write_text(NEUTERED_GITATTRIBUTES, encoding="utf-8")
def step(
func: Callable[[], R],
desc: str = "",
post_func: Callable[[R], R] = _default_post_func,
post_print: bool = True,
) -> R:
"""
helper function for running steps
args:
desc: str
description of the step
func: Callable[[], R]
function to run
post_func: Callable[[R], R]
post-function to run after func
post_print: bool
whether to print done after the step
returns:
R
return object from func
"""
# run the function
if desc != "":
print(f"{desc}..", end="", flush=True)
start_time = time()
try:
cp = func()
except Exception as exc:
print(
f"\n\nfailure running step: {exc} ({exc.__class__.__name__})",
"\n".join(format_tb(exc.__traceback__)) + "\n",
sep="\n",
)
exit(1)
if desc != "":
print(".", end="", flush=True)
# run the post-function
try:
rp = post_func(cp)
except Exception as exc:
print(
f"\n\nfailure running post-step: {exc} ({exc.__class__.__name__})",
"\n".join(format_tb(exc.__traceback__)) + "\n",
sep="\n",
)
exit(1)
end_time = time()
# yay
if desc != "" and post_print:
print(
f" done in {generate_time_elapsed_string(end_time - start_time)}",
flush=True,
)
return rp
def post_remote_v(cp: CompletedProcess) -> CompletedProcess:
"""
post-call function for 'git remote -v' command, parses the output and
checks for the forge and github remotes, storing them in the shared state
under 'remote/forge', 'remote/forge/url', 'remote/github', and
'remote/github/url' respectively
"""
if not isinstance(cp.stdout, bytes):
return _command_post_func(cp)
for line in cp.stdout.decode().split("\n"):
# github https://github.com/markjoshwel/sota (fetch)
# github https://github.com/markjoshwel/sota (push)
# origin https://forge.joshwel.co/mark/sota.git (fetch)
# origin https://forge.joshwel.co/mark/sota.git (push)
split_line = line.split(maxsplit=1)
if len(line) < 2:
continue
# remote='origin' url='https://forge.joshwel.co/mark/sota.git (fetch)'
remote, url = split_line
# clean up the url
if (REPO_URL_FORGE in url) or (REPO_URL_GITHUB in url):
# url='https://forge.joshwel.co/mark/sota.git'
url = url.split("(", maxsplit=1)[0].strip()
if REPO_URL_FORGE in url:
r["remote/forge"] = remote
r["remote/forge/url"] = url
elif REPO_URL_GITHUB in url:
r["remote/github"] = remote
r["remote/github/url"] = url
return _command_post_func(cp)
def err(message: str, exc: Exception | None = None) -> None:
"""
helper function for printing error messages, prints the message and the
shared state dictionary r
args:
message: str
the error message to print
exc: Exception | None
the exception that caused the error, if any
"""
print(
"\n" + message,
(
""
if (exc is None)
else indent(
text=(
f"{exc} ({exc.__class__.__name__})\n"
f"{'\n'.join(format_tb(exc.__traceback__))}\n"
),
prefix=INDENT,
)
)
+ (indent(text=pformat(r), prefix=INDENT) + "\n"),
sep="\n",
)
exit(1)
def main() -> None:
"""
command line entry point
"""
cumulative_start_time = time()
with TemporaryDirectory(delete="--keep" not in argv) as dir_temp:
print(
"\nsota staircase ReStepper\n"
"\n"
"directories\n"
f" real repo : {REPO_DIR}\n"
f" temp repo : {dir_temp}\n"
)
# helper partial function for command
def cmd(
command: str,
wd: Path | str = dir_temp,
capture_output: bool = True,
give_input: str | None = None,
) -> Callable[[], CompletedProcess]:
return lambda: run(
command,
cwd=wd,
capture_output=capture_output,
give_input=give_input,
)
step(
func=cmd("git filter-repo --version"),
post_func=post_filter_repo_check,
)
step(cmd("git status --porcelain", wd=REPO_DIR))
if (not r["blank"]) and ("--iknowwhatimdoing" not in argv):
err(
"critical error: repository is not clean, please commit changes first",
)
start_time = time()
print("1 pre | finding large files", end="", flush=True)
files, sim = iter_files(REPO_DIR)
if "--skipsotaignoregen" not in argv:
flf_filter_result = find_large_files(files, sim)
large_files = flf_filter_result.files
end_time = time()
print(
"1 pre | finding large files... "
f"done in {generate_time_elapsed_string(end_time - start_time)} (found {len(large_files)})"
)
if large_files:
start_time = time()
was_written = step(
desc="2 pre | writing .sotaignore",
func=lambda: write_sotaignore(large_files),
post_func=lambda cp: cp,
post_print=False,
)
end_time = time()
if was_written:
print(
f" done in {generate_time_elapsed_string(end_time - start_time)}"
)
else:
print(" not needed")
else:
end_time = time()
print(
"1 pre | finding large files... "
f"skipped in {generate_time_elapsed_string(end_time - start_time)}"
)
print("3 pre | duplicating repo... pre-scanning", end="", flush=True)
start_time = time()
with CopyHighway(
message="3 pre | duplicating repo",
total=len(list(REPO_DIR.rglob("*"))),
lff_result=flf_filter_result,
) as copier:
copytree(
src=REPO_DIR,
dst=dir_temp,
copy_function=copier.copy2,
dirs_exist_ok=True,
)
end_time = time()
print(
f"3 pre | duplicating repo... done in {generate_time_elapsed_string(end_time - start_time)}",
flush=True,
)
step(cmd('python -c "import pathlib; print(pathlib.Path.cwd().absolute())"'))
if str(Path(dir_temp).absolute()) != r["stdout"].strip():
err(
"critical error (whuh? internal?): "
f"not inside the temp dir '{str(Path(dir_temp).absolute())}'"
)
# check for forge and github remotes
step(
func=cmd("git remote -v"),
post_func=post_remote_v,
)
if "remote/forge" not in r:
err("critical error (whuh?): no forge remote found")
# get the current branch
step(cmd("git branch --show-current"))
branch = r["stdout"].strip()
if r.get("errored", "yes") or branch == "":
err("critical error (whuh?): couldn't get current branch")
step(cmd(f"git fetch {r['remote/forge']}"))
step(cmd(f"git rev-list HEAD...{r['remote/forge']}/{branch} --count"))
if (r.get("stdout", "").strip() != "0") and ("--dirty" not in argv):
err(
"critical error (whuh?): "
"not up to date with forge... sync your changes first?"
)
step(desc="4 lfs | fetch lfs objects", func=cmd("git lfs fetch"))
step(
desc="5 lfs | migrating lfs objects",
func=cmd(
'git lfs migrate export --everything --include="*" --remote=origin',
give_input="y\n",
),
)
step(
desc="6 lfs | uninstall lfs in repo",
func=cmd("git lfs uninstall"),
)
step(
func=cmd("git lfs ls-files"),
)
if not r["blank"]:
err(
"critical error (whuh? internal?): "
"lfs objects still exist post-migrate and uninstall"
)
if REPO_SOTAIGNORE.exists():
try:
sotaignore = REPO_SOTAIGNORE.read_text(encoding="utf-8").strip()
except Exception as exc:
err("critical error: couldn't read .sotaignore file", exc=exc)
sotaignored_files: list[str] = [
line
for line in sotaignore.splitlines()
if not line.startswith("#") and line.strip() != ""
]
step(
desc=f"7 lfs | filtering {len(sotaignored_files)} file(s)",
func=cmd(
"git filter-repo --force --invert-paths "
+ " ".join(f'--path ""{lf}' "" for lf in sotaignored_files)
),
)
# also copy to the temp repo; step 5 (lfs migrate) wipes uncommitted changes
copy2(REPO_SOTAIGNORE, Path(dir_temp).joinpath(".sotaignore"))
step(
desc="8 fin | neuter .gitattributes",
func=lambda: rewrite_gitattributes(Path(dir_temp)),
)
def add_and_commit() -> CompletedProcess:
cp = cmd("git add *")()
if cp.returncode != 0:
return cp
return cmd(
"git commit --allow-empty "
f'-am "{COMMIT_MESSAGE}" --author="{COMMIT_AUTHOR}"',
)()
step(
desc="9 fin | commit",
func=add_and_commit,
)
if r.get("remote/github") is None:
step(
func=cmd(f"git remote add github https://{REPO_URL_GITHUB}.git"),
)
if r.get("errored", "yes"):
err("critical error (whuh?): couldn't add github remote")
r["remote/github"] = "github"
step(
desc=f"X fin | pushing to {r['remote/github']}/{branch}",
func=cmd(
f"git push {r['remote/github']} {branch} --force"
if ("--test" not in argv)
else "git --version"
),
)
cumulative_end_time = time()
print(
f"\n--- done! took {generate_time_elapsed_string(cumulative_end_time - cumulative_start_time)}~ "
"☆*: .。. o(≧▽≦)o .。.:*☆ ---",
flush=True,
)
if __name__ == "__main__":
main()