Compare commits

..

10 commits

Author SHA1 Message Date
2eeba9ec9d chore: 5.3.5+500b60 2025-01-22 01:22:12 +08:00
500b6067b2 sidestepper: dont start and end with padding "\n"s
also actually error out if we mess up with std::process::exit
2025-01-22 01:09:26 +08:00
c235c63194 sidestepper: better error handling and posix paths 2025-01-22 00:20:25 +08:00
Mark Joshwel
82993fb4a5
docs: damn it 2025-01-21 14:17:48 +00:00
5f60a5fad2 sidestepper: yeah its done lol
this language is kinda nice actually
2025-01-21 22:12:25 +08:00
fa05392348 docs: update 2025-01-21 22:12:07 +08:00
3823294151 cargo: update 2025-01-21 22:11:59 +08:00
445dbe2424 docs: oopses 2: update wording
god damn
2025-01-21 15:59:53 +08:00
cf3a7cdc7c sidestepper: forego parallelism
i doubt it'd speed up much, that's an experiment for another time
2025-01-21 15:59:05 +08:00
4f7c3e0fbd docs: oopses 2025-01-20 05:05:26 +08:00
6 changed files with 504 additions and 77 deletions

1
.gitignore vendored
View file

@ -1 +1,2 @@
/target
/releases

263
Cargo.lock generated
View file

@ -2,6 +2,267 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "bstr"
version = "1.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "531a9155a481e2ee699d4f98f43c0ca4ff8ee1bfd55c31e9e98fb29d2b176fe0"
dependencies = [
"memchr",
"serde",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "globset"
version = "0.4.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15f1ce686646e7f1e19bf7d5533fe443a45dbfb990e00629110797578b42fb19"
dependencies = [
"aho-corasick",
"bstr",
"log",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "ignore"
version = "0.4.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d89fd380afde86567dfba715db065673989d6253f42b88179abd3eae47bda4b"
dependencies = [
"crossbeam-deque",
"globset",
"log",
"memchr",
"regex-automata",
"same-file",
"walkdir",
"winapi-util",
]
[[package]]
name = "log"
version = "0.4.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "proc-macro2"
version = "1.0.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
dependencies = [
"proc-macro2",
]
[[package]]
name = "regex-automata"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]
[[package]]
name = "serde"
version = "1.0.217"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.217"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "sidestepper"
version = "5.0.0"
version = "5.3.5+500b606"
dependencies = [
"ignore",
]
[[package]]
name = "syn"
version = "2.0.96"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "unicode-ident"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
[[package]]
name = "walkdir"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
dependencies = [
"same-file",
"winapi-util",
]
[[package]]
name = "winapi-util"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
dependencies = [
"windows-sys",
]
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

View file

@ -1,6 +1,7 @@
[package]
name = "sidestepper"
version = "5.0.0"
version = "5.3.5+500b606"
edition = "2021"
[dependencies]
ignore = "0.4.23"

View file

@ -1,4 +1,4 @@
BSD Zero Clause License
BSD Zero Clause License
Copyright (c) 2025 mark joshwel <mark@joshwel.co>

View file

@ -1,14 +1,22 @@
# sota staircase SideStepper
# sota staircase SideStepper
a fast .gitignore-respecting large file finder for .git repositories trying to
weed out large LFS files
a fast enough .gitignore-respecting large file finder
made for me to find large files in my unity game repositories that wouldn't
fit github's 100mb limit, for when i'd push my repositories to github for
schoolwork submission
rewritten from [python](https://forge.joshwel.co/mark/sota/src/branch/main/sidestepper.py)
to rust, as a reason to learn rust
**this is brain made software**: large language-based code generation has not
directly used here. but i'd be lying if i said i didn't ask chatgpt if there
was a better way to check a boolean result lol
been extensively used here. but i'd be lying if i said i didn't ask chatgpt if
there was a better way to check a boolean result lol
## quickstart
**note:** there aren't any releases nor a nix flake yet!
### installing a binary
**note:** all non-windows builds are statically linked
@ -24,9 +32,9 @@ was a better way to check a boolean result lol
### build it yourself
1. [get rust and cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html#install-rust-and-cargo)
2. `cargo build release`
2. run `cargo build --release`
**nix users, rejoice:** `nix run github:markjoshwel/sidestepper` or `nix run git+:https://forge.joshwel.co/mark/sidestepper`
**nix users, rejoice:** run `nix run github:markjoshwel/sidestepper` or `nix run git+https://forge.joshwel.co/mark/sidestepper`
### running it
@ -41,20 +49,20 @@ or on windows,
```
it'll look for a `.git` directory in the current or parent directories, if you
want to use this not in the context i usually use this for, pass in
`--search-here` to treat the current working directory as the 'repository root'
want to use this not in the context i usually use this for (which is for git
repositories), pass in `--search-here` to treat the current working directory
as the 'repository root'
it'll then make a `.sotaignore` file that i use in my other tooling,
but if you want output more friendly for integration in other places,
pass in `--plumbing` for it to output encountered large files, line-by-line, to
stdout
it'll then make a `.sotaignore` file that i use in my other tooling scripts,
but if you want it to output external-tool-friendly output to stdout, pass in
`--plumbing` for it to output encountered large files, line-by-line, to stdout
## historical changes
- v5 (i3/a4) - rewritten in rust lol
- v5 (i3/a5) - 3rd implementation, rewritten in rust lol (no longer using iod-ttt, just piggybacking off [ignore](https://crates.io/crates/ignore)'s WalkBuilder)
- v4 (i2/a4) - optimised single iod-ttt
- v3 (i2/a3) - faster matching by remembering ignored directories (ignore on demand, 'iod')
- v2 (i2/a2) - corrected ignored directory matching (named 'trytrytry')
- v2 (i2/a2) - 2nd implementation, corrected ignored directory matching (named 'trytrytry', 'ttt')
- v1 (i1/a1) - original python script, still embedded within ReStepper
## licence

View file

@ -14,88 +14,190 @@
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
// IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
use std::env;
use ignore;
use std::error::Error;
use std::fs::metadata;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::process::exit;
use std::time::{Duration, SystemTime};
use std::{env, fs, io, path};
const SOTA_SIDESTEP_CHUNK_SIZE: u16 = 16;
const SOTA_SIDESTEP_MAX_WORKERS: u16 = 4;
const SOTA_SIDESTEP_LARGE_FILE_SIZE: u64 = 100000000; // 100mb
#[derive(Debug)]
struct Behaviour {
repo_dir_path: PathBuf,
repo_sotaignore_path: PathBuf,
parallel: bool,
chunk_size: u16,
max_workers: u16,
large_file_size: u64,
plumbing: bool,
}
fn cli_get_behaviour() -> Result<Behaviour, Box<dyn Error>> {
// get environment variables
let chunk_size: u16 = match env::var("SOTA_SIDESTEP_CHUNK_SIZE") {
Ok(val) => val.parse::<u16>().unwrap_or(SOTA_SIDESTEP_CHUNK_SIZE),
Err(_) => SOTA_SIDESTEP_CHUNK_SIZE,
};
let max_workers: u16 = match env::var("SOTA_SIDESTEP_MAX_WORKERS") {
Ok(val) => val.parse::<u16>().unwrap_or(SOTA_SIDESTEP_MAX_WORKERS),
Err(_) => SOTA_SIDESTEP_MAX_WORKERS,
};
let large_file_size: u64 = match env::var("SOTA_SIDESTEP_LARGE_FILE_SIZE") {
Ok(val) => val.parse::<u64>().unwrap_or(SOTA_SIDESTEP_LARGE_FILE_SIZE),
Err(_) => SOTA_SIDESTEP_LARGE_FILE_SIZE,
};
let parallel: bool = 'get_parallel: {
// future me move this to a higher block if we ever need args
// anywhere else also what the hell, labeled blocks?
// huh -- the community seems wishy-washy on it,
// but this seems like a harmless use of em
let args: Vec<String> = env::args().collect();
if env::var("SOTA_SIDESTEP_PARALLEL").is_ok() {
break 'get_parallel true;
}
if args.iter().any(|arg| arg == "--parallel") {
break 'get_parallel true;
}
false
};
// find repo dir
// go through each parent dir until one of them has a .git directory in it
let current_dir = env::current_dir().unwrap();
// look through args and see if the '--search-here' or '--plumbing' flags are present
let mut search_here: bool = false;
let mut plumbing: bool = false;
for arg in env::args() {
if arg == "--search-here" {
search_here = true;
}
if arg == "--plumbing" {
plumbing = true;
}
}
let current_dir = env::current_dir().map_err(|_| "could not get current working directory")?;
// if we're searching here anyway, return early using the current dir
if search_here {
return Ok(Behaviour {
repo_dir_path: PathBuf::from(&current_dir),
repo_sotaignore_path: PathBuf::from(&current_dir.join(".sotaignore")),
large_file_size,
plumbing,
});
}
// else, find the repo dir
// (go through each parent dir until one of them has a .git directory in it)
let mut dir = current_dir.as_path();
let mut possible_repo_dir_path: Option<&Path> = None;
while dir.components().count() > 1 {
// check if there's a .git directory nearby
if dir.join(".git/").try_exists().ok() == Some(true) {
possible_repo_dir_path = Option::from(dir);
break;
let repo_dir_path: PathBuf = loop {
if dir.join(".git").try_exists().unwrap_or(false) {
break dir.into();
}
// iterate down!
if let Some(parent) = dir.parent() {
dir = parent;
} else {
break;
return Err(
"could not find a .git repository in the current or parent directories".into(),
);
}
}
if possible_repo_dir_path.is_none() {
return Err("could not find a .git repository in the current or parent directories".into());
}
let repo_dir_path = possible_repo_dir_path.unwrap();
};
Ok(Behaviour {
repo_dir_path: PathBuf::from(repo_dir_path),
repo_sotaignore_path: PathBuf::from(repo_dir_path.join(".sotaignore")),
parallel,
chunk_size,
max_workers,
repo_dir_path: PathBuf::from(&repo_dir_path),
repo_sotaignore_path: PathBuf::from(&repo_dir_path.join(".sotaignore")),
large_file_size,
plumbing,
})
}
/// Walks `behaviour.repo_dir_path` with `ignore::WalkBuilder` and returns the
/// path of every regular file the walker yields, except those located under
/// the repository's `.git` directory.
///
/// Entries the walker reports as errors are dropped silently.
fn ss_scan_for_unignored_files(behaviour: &Behaviour) -> Vec<PathBuf> {
    // `Path::starts_with` compares whole components, so this only matches
    // paths that are inside `<repo>/.git`, not e.g. `<repo>/.github`
    let git_dir = Path::new(&behaviour.repo_dir_path).join(".git/");

    // NOTE(review): per the `ignore` crate docs, `hidden(false)` turns off the
    // walker's default skipping of hidden files -- confirm against the
    // crate version in use
    let walker = ignore::WalkBuilder::new(&behaviour.repo_dir_path)
        .hidden(false)
        .build();

    let mut unignored_files = Vec::new();
    for walked in walker {
        let entry = match walked {
            Ok(entry) => entry,
            Err(_) => continue,
        };

        // same evaluation order as a `!in_git && is_file` test: the
        // filesystem is only queried for paths outside of `.git`
        if entry.path().starts_with(&git_dir) || !entry.path().is_file() {
            continue;
        }

        unignored_files.push(entry.into_path());
    }
    unignored_files
}
/// Returns, in their original order, the paths from `files` whose on-disk
/// size is at least `behaviour.large_file_size` bytes.
///
/// Paths whose metadata cannot be read (missing file, permission error, ...)
/// are left out of the result rather than reported.
fn ss_check_for_large_files(behaviour: &Behaviour, files: &Vec<PathBuf>) -> Vec<PathBuf> {
    let threshold = behaviour.large_file_size;
    let mut oversized: Vec<PathBuf> = Vec::new();

    for candidate in files {
        match metadata(candidate) {
            // the comparison is inclusive: a file of exactly `threshold` bytes counts
            Ok(info) if info.len() >= threshold => oversized.push(candidate.clone()),
            // too small, or metadata unavailable
            _ => {}
        }
    }

    oversized
}
/// Records `large_files` either on stdout (plumbing mode) or in the
/// repository's `.sotaignore` file.
///
/// Returns `Ok(true)` when something was emitted or written, and `Ok(false)`
/// when there was nothing to do: `large_files` is empty, or every entry is
/// already listed in the existing `.sotaignore`.
///
/// In plumbing mode each path is printed to stdout exactly as given, one per
/// line, and no file is touched. Otherwise paths are made relative to
/// `behaviour.repo_dir_path` (paths outside of it are skipped), converted to
/// forward-slash form, appended to the existing entries, and the whole file is
/// rewritten with a trailing newline. A three-line comment header is prepended
/// when the first line is not already a `#` comment.
///
/// # Errors
///
/// Propagates any I/O error from reading the existing `.sotaignore` or from
/// creating/writing the new one.
fn ss_write_sotaignore(behaviour: &Behaviour, large_files: &Vec<PathBuf>) -> io::Result<bool> {
    if large_files.is_empty() {
        return Ok(false);
    }

    // are we outputting to stdout for other programs?
    // do so and return true, we did write something
    if behaviour.plumbing {
        eprintln!();
        for file in large_files {
            // lossy conversion so a non-UTF-8 path still yields a usable line
            // instead of an empty one
            println!("{}", file.to_string_lossy());
        }
        return Ok(true);
    }

    let mut sotaignore: Vec<String> =
        if behaviour.repo_sotaignore_path.try_exists().unwrap_or(false) {
            fs::read_to_string(&behaviour.repo_sotaignore_path)?
                .lines()
                .map(String::from)
                .collect()
        } else {
            Vec::new()
        };
    // entries are only ever appended below, so a length comparison is enough
    // to tell whether anything new was added
    let previous_len = sotaignore.len();

    for file in large_files {
        if let Ok(file_relative) = file.strip_prefix(&behaviour.repo_dir_path) {
            // a relative path that isn't valid UTF-8 falls back to the lossy
            // rendering of the full path
            let native = match file_relative.to_str() {
                Some(text) => text.to_string(),
                None => file.to_string_lossy().into_owned(),
            };

            // posix-path-ify it for cross compatibility -- this has to happen
            // BEFORE the duplicate check, because the file stores the
            // forward-slash form; comparing the native form would re-add
            // every nested path on each run on windows
            let entry = if path::MAIN_SEPARATOR_STR == "\\" {
                native.replace('\\', "/")
            } else {
                native
            };

            if !sotaignore.contains(&entry) {
                sotaignore.push(entry);
            }
        }
    }

    // no new changes? return, nothing has been written
    if sotaignore.len() == previous_len {
        return Ok(false);
    }

    // check if the sotaignore file starts with a comment
    // (`first()` avoids indexing into a possibly-empty list)
    if sotaignore
        .first()
        .map_or(false, |line| !line.starts_with('#'))
    {
        let header = [
            "# .sotaignore file generated by sota staircase ReStepper/SideStepper",
            "# anything here either can't or shouldn't be uploaded to GitHub",
            "# unless you know what you're doing, don't edit this file! >:(",
        ];
        sotaignore.splice(0..0, header.iter().map(|&line| line.to_string()));
    }

    let mut sotaignore_file = fs::File::create(&behaviour.repo_sotaignore_path)?;
    sotaignore_file.write_all(sotaignore.join("\n").as_bytes())?;
    sotaignore_file.write_all(b"\n")?;

    Ok(true)
}
/// Formats a duration given in seconds for the progress output.
///
/// - more than 60 seconds and, once rounded to the nearest whole second, at
///   least an hour: `"{hours}h {minutes} {seconds:.1}"`
/// - more than 60 seconds otherwise: `"{minutes} {seconds:.2}"`
/// - anything else (including NaN): the raw value with three decimals
///
/// For the two longer forms the duration is rounded to the nearest whole
/// second *before* it is split into hours/minutes/seconds, so a round-up
/// carries into the next unit (119.6 becomes `2 0.00`, never `1 60.00`) and a
/// duration of exactly one hour is reported with its hour component.
fn format_elapsed_time(secs: f64) -> String {
    let whole = secs.round();

    if whole >= 3600.0 {
        let hours = (whole / 3600.0).floor() as i64;
        let minutes = ((whole % 3600.0) / 60.0).floor() as i64;
        format!("{}h {} {:.1}", hours, minutes, whole % 60.0)
    } else if secs > 60.0 {
        // `whole` is below 3600 here, so no hour component to strip
        let minutes = (whole / 60.0).floor() as i64;
        format!("{} {:.2}", minutes, whole % 60.0)
    } else {
        format!("{:.3}", secs)
    }
}
fn main() {
eprintln!("sota staircase SideStepper v5 (i3/a4)");
eprintln!("sota staircase SideStepper v5 (i3/a5)");
let behaviour = {
let behaviour = cli_get_behaviour();
// huh. pattern matching consumes the variable, so we ref (&) it. damn.
@ -105,17 +207,71 @@ fn main() {
}
behaviour.unwrap()
};
eprintln!(
" repo root : {}\n .sotaignore : {} ({})\n parallel : {}",
behaviour.repo_dir_path.to_str().unwrap(),
behaviour.repo_sotaignore_path.to_str().unwrap(),
" repo root : {}\n .sotaignore : {}\n",
behaviour.repo_dir_path.to_string_lossy(),
{
if behaviour.repo_sotaignore_path.try_exists().ok() == Some(true) {
"exists"
if behaviour.plumbing {
"(stdout)".into()
} else {
"non-existent"
format!(
"{} ({})",
behaviour.repo_sotaignore_path.to_string_lossy(),
match behaviour.repo_sotaignore_path.try_exists() {
Ok(true) => "exists",
Ok(false) => "non-existent",
Err(_) => "unknown",
}
)
}
},
behaviour.parallel
);
let zero_duration = Duration::new(0, 0);
let all = SystemTime::now();
eprint!("1/3 scanning repository... ");
let now = SystemTime::now();
let files = ss_scan_for_unignored_files(&behaviour);
eprintln!(
"done in {} (found {})",
format_elapsed_time(now.elapsed().unwrap_or(zero_duration).as_secs_f64()),
files.len()
);
eprint!("2/3 finding large files... ");
let now = SystemTime::now();
let large_files = ss_check_for_large_files(&behaviour, &files);
eprintln!(
"done in {} (found {})",
format_elapsed_time(now.elapsed().unwrap_or(zero_duration).as_secs_f64()),
large_files.len()
);
eprint!("3/3 writing .sotaignore file... ");
match ss_write_sotaignore(&behaviour, &large_files) {
Ok(true) => {
eprintln!(
"{}",
if behaviour.plumbing {
"done (to stdout)"
} else {
"done"
}
);
}
Ok(false) => {
eprintln!("skipped")
}
Err(e) => {
eprintln!("error: ({})", e);
exit(2)
}
}
eprintln!(
"\n--- done! took {} ″~ ☆*: .。. o(≧▽≦)o .。.:*☆ ---",
format_elapsed_time(all.elapsed().unwrap_or(zero_duration).as_secs_f64())
);
}