From 48923450f8ebb863c8362d2be26cfe7b373c0a34 Mon Sep 17 00:00:00 2001 From: vikingowl Date: Sun, 22 Feb 2026 08:41:50 +0100 Subject: [PATCH] docs: add architecture plan and research notes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initial project documentation for workflow-miner — a Rust CLI + zsh plugin that mines recurring command workflows from Atuin shell history. --- docs/architecture.md | 367 +++++++++++++++++++++++++++++++ docs/research-atuin.md | 98 +++++++++ docs/research-existing-tools.md | 114 ++++++++++ docs/research-pattern-mining.md | 103 +++++++++ docs/research-rust-ecosystem.md | 118 ++++++++++ docs/research-zsh-integration.md | 113 ++++++++++ 6 files changed, 913 insertions(+) create mode 100644 docs/architecture.md create mode 100644 docs/research-atuin.md create mode 100644 docs/research-existing-tools.md create mode 100644 docs/research-pattern-mining.md create mode 100644 docs/research-rust-ecosystem.md create mode 100644 docs/research-zsh-integration.md diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..fef9fbe --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,367 @@ +# Architecture + +## Overview + +A Rust CLI + zsh plugin that reads Atuin's shell history SQLite database, mines recurring multi-step command workflows using sequential pattern mining, and surfaces them as reusable recipes. 
+ +## System Diagram + +``` +┌─────────────────────────────────────────────────────┐ +│ CLI binary (wfm) │ +├──────────┬──────────┬───────────┬───────────────────┤ +│ ingest │ mine │ store │ output │ +│ │ │ │ │ +│ read │ prefix │ own │ ┌─ tui dashboard │ +│ atuin │ span │ sqlite │ ├─ zsh widget │ +│ sqlite │ + BIDE │ for │ ├─ justfile gen │ +│ (r/o) │ │ patterns │ ├─ json/yaml │ +│ │ cluster │ │ └─ llm describe │ +│ abstract│ │ │ (optional) │ +│ commands│ │ │ │ +└──────────┴──────────┴───────────┴───────────────────┘ + ▲ │ + │ ▼ + ~/.local/share/atuin/history.db ~/.config/wfm/ + ~/.local/share/wfm/ + workflows.db + justfile (gen) +``` + +## Crate Structure (Cargo workspace) + +``` +wfm/ +├── Cargo.toml # workspace root +├── crates/ +│ ├── wfm-core/ # domain logic, no I/O +│ │ ├── src/ +│ │ │ ├── lib.rs +│ │ │ ├── abstraction.rs # command normalization +│ │ │ ├── prefixspan.rs # PrefixSpan algorithm +│ │ │ ├── clustering.rs # similar pattern dedup +│ │ │ ├── context.rs # project/cwd/git inference +│ │ │ └── workflow.rs # Workflow struct + scoring +│ │ └── Cargo.toml +│ ├── wfm-store/ # persistence (read atuin, write own db) +│ │ ├── src/ +│ │ │ ├── lib.rs +│ │ │ ├── atuin.rs # read atuin history.db +│ │ │ ├── workflows.rs # read/write workflows.db +│ │ │ └── models.rs # shared DB models +│ │ └── Cargo.toml +│ ├── wfm-cli/ # CLI binary +│ │ ├── src/ +│ │ │ ├── main.rs +│ │ │ ├── commands/ +│ │ │ │ ├── mod.rs +│ │ │ │ ├── scan.rs # mine workflows from history +│ │ │ │ ├── list.rs # list discovered workflows +│ │ │ │ ├── show.rs # show workflow detail +│ │ │ │ ├── export.rs # justfile / json / yaml export +│ │ │ │ ├── suggest.rs # suggest next command (for zsh) +│ │ │ │ ├── forget.rs # dismiss/hide a workflow +│ │ │ │ └── describe.rs # LLM naming (optional) +│ │ │ └── tui/ +│ │ │ ├── mod.rs +│ │ │ ├── app.rs # ratatui app state +│ │ │ ├── dashboard.rs # main view +│ │ │ ├── detail.rs # workflow detail view +│ │ │ └── export.rs # export dialog +│ │ └── Cargo.toml +│ └── 
wfm-zsh/ # zsh plugin (shell scripts, not Rust) +│ ├── wfm.plugin.zsh # main plugin entry +│ ├── functions/ +│ │ └── _wfm # completions +│ └── README.md +└── README.md +``` + +## Core Data Flow + +### Phase 1: Ingest + +``` +atuin history.db + │ + ▼ +SELECT command, timestamp, cwd, session, duration, exit +FROM history WHERE deleted_at IS NULL +ORDER BY session, timestamp + │ + ▼ +Group by session → Vec<Session { commands: Vec<Command> }> + │ + ▼ +Abstract each command: + "git commit -m 'fix typo'" → ["git", "commit", "-m", "<MSG>"] + "docker compose up -d" → ["docker", "compose", "up", "-d"] + "cargo test -- --nocapture"→ ["cargo", "test", "--", "<ARG>"] + │ + ▼ +Optionally enrich with context: + - infer project root from cwd (walk up for .git, Cargo.toml, package.json) + - record git branch if .git exists (shell out to git rev-parse) +``` + +### Phase 2: Mine + +``` +Abstracted session sequences + │ + ▼ +PrefixSpan(min_support=2, max_gap=5, min_length=2, max_length=20) + │ + ▼ +Frequent sequential patterns with support counts + │ + ▼ +BIDE filter → closed patterns only (remove redundant sub-patterns) + │ + ▼ +Cluster similar patterns (edit distance on abstract command sequences) + │ + ▼ +Score patterns: + - frequency (how often) + - recency (when last seen) + - consistency (same order every time? or variations?) + - context specificity (always in same cwd/project? or global?) 
+ │ + ▼ +Ranked list of Workflow candidates +``` + +### Phase 3: Store + +Own SQLite database at `~/.local/share/wfm/workflows.db`: + +```sql +CREATE TABLE workflows ( + id INTEGER PRIMARY KEY, + name TEXT, -- auto-generated or LLM-named + description TEXT, -- optional, LLM-generated + commands TEXT NOT NULL, -- JSON array of abstracted commands + raw_example TEXT NOT NULL, -- JSON: one concrete instance + support INTEGER NOT NULL, -- how many sessions contain this + first_seen INTEGER NOT NULL, -- timestamp + last_seen INTEGER NOT NULL, -- timestamp + avg_duration INTEGER, -- average end-to-end workflow duration across instances + contexts TEXT, -- JSON array of {cwd, project, branch} + score REAL NOT NULL, -- composite ranking score + status TEXT DEFAULT 'active', -- active | dismissed | exported + created_at INTEGER NOT NULL, + updated_at INTEGER NOT NULL +); + +CREATE TABLE workflow_instances ( + id INTEGER PRIMARY KEY, + workflow_id INTEGER REFERENCES workflows(id), + session TEXT NOT NULL, + timestamp INTEGER NOT NULL, + commands TEXT NOT NULL, -- JSON: concrete commands for this instance + cwd TEXT, + duration INTEGER +); + +CREATE INDEX idx_workflows_score ON workflows(score DESC); +CREATE INDEX idx_workflows_status ON workflows(status); +CREATE INDEX idx_instances_workflow ON workflow_instances(workflow_id); +``` + +### Phase 4: Output + +#### CLI commands + +``` +wfm scan # mine workflows from atuin history +wfm scan --since 30d # only recent history +wfm scan --project . # only current project context +wfm list # list discovered workflows, ranked +wfm list --context . # workflows relevant to current dir +wfm show <id> # detail view of a workflow +wfm export justfile # generate Justfile from top workflows +wfm export json # JSON catalog +wfm export yaml # YAML catalog +wfm suggest # what should I do next? 
(context-aware) +wfm forget # dismiss a workflow +wfm describe # name/describe with LLM (optional) +wfm describe --all # batch LLM naming +wfm tui # interactive dashboard +``` + +## Command Abstraction Strategy + +The abstractor converts concrete commands into mineable tokens: + +``` +Level 0 (raw): git commit -m "fix: handle null response" +Level 1 (words): ["git", "commit", "-m", "fix: handle null response"] +Level 2 (abstract):["git", "commit", "-m", ""] +Level 3 (verb): ["git", "commit"] +``` + +Mining uses Level 2 by default (preserves flags, abstracts values). Level 3 is a fallback for very noisy histories. + +Special handling: +- Pipelines: split on `|` — abstract each segment independently +- Compound: split on `&&`, `||`, `;` — treat as sub-sequence +- Subshells/redirections: strip `>`, `>>`, `2>&1`, `<()` for pattern matching +- Common tools with known flag semantics: git, docker, kubectl, cargo, npm — maintain a small registry of "flags that take values" for better abstraction + +## Scoring Formula + +``` +score = w_freq * log(support) + + w_recency * decay(days_since_last_seen) + + w_consistency * (1 - variance_in_ordering) + + w_specificity * context_specificity_ratio +``` + +Default weights: `w_freq=0.4, w_recency=0.3, w_consistency=0.2, w_specificity=0.1` + +## TUI Dashboard (ratatui) + +``` +┌─ Workflow Miner ─────────────────────────────────────┐ +│ [Workflows] [Contexts] [Export] │ +├──────────────────────────────────────────────────────┤ +│ # │ Name │ Steps │ Freq │ Last Seen │ +│ 1 │ deploy-staging │ 5 │ 23 │ 2h ago │ +│ 2 │ test-and-commit │ 3 │ 87 │ 15m ago │ +│ 3 │ docker-rebuild │ 4 │ 12 │ 1d ago │ +│ 4 │ debug-api-logs │ 6 │ 8 │ 3d ago │ +│ ►5 │ db-migration │ 4 │ 5 │ 1w ago │ +├──────────────────────────────────────────────────────┤ +│ db-migration (5 occurrences, avg 45s) │ +│ │ +│ 1. cargo sqlx prepare │ +│ 2. cargo sqlx migrate run │ +│ 3. cargo test --test db_tests │ +│ 4. 
git add migrations/ │ +│ │ +│ Context: ~/projects/myapp (branch: main, feature/*) │ +│ │ +│ [e]xport [d]escribe [f]orget [Enter] instances │ +└──────────────────────────────────────────────────────┘ +``` + +## zsh Plugin + +### Suggestion modes (configurable) + +- **keybinding** (default): User presses `Ctrl+Space` to request a suggestion. Zero overhead. +- **ambient**: `precmd` hook calls `wfm suggest` after every command. Must stay under 50ms. +- **both**: Ambient display + keybinding to accept. + +### Widget sketch + +```zsh +_wfm_suggest() { + local suggestion + suggestion=$("${WFM_BIN:-wfm}" suggest \ + --cwd "$PWD" \ + --last-cmd "$_wfm_last_cmd" \ + --session "$_wfm_session" \ + --format oneline 2>/dev/null) + + if [[ -n "$suggestion" ]]; then + _wfm_suggestion="$suggestion" + zle reset-prompt + fi +} + +_wfm_accept() { + if [[ -n "$_wfm_suggestion" ]]; then + BUFFER="$_wfm_suggestion" + CURSOR=${#BUFFER} + _wfm_suggestion="" + zle reset-prompt + fi +} + +zle -N _wfm_suggest +zle -N _wfm_accept +bindkey '^@' _wfm_accept # Ctrl+Space +``` + +## Configuration + +`~/.config/wfm/config.toml`: + +```toml +[source] +atuin_db = "~/.local/share/atuin/history.db" # auto-detected + +[mining] +min_support = 2 # minimum occurrences to be a workflow +max_gap = 5 # max intervening commands between steps +min_length = 2 # minimum workflow steps +max_length = 20 # maximum workflow steps +scan_window = "90d" # how far back to look by default + +[scoring] +freq_weight = 0.4 +recency_weight = 0.3 +consistency_weight = 0.2 +specificity_weight = 0.1 + +[suggest] +enabled = true +mode = "keybinding" # keybinding | ambient | both +min_score = 0.5 # don't suggest low-confidence workflows +show_in = "rprompt" # rprompt | inline | notification + +[llm] +enabled = false # opt-in +command = "llm" # or "ollama run llama3.2" +model = "" # provider-specific model name + +[export] +justfile_path = "./Justfile.wfm" +``` + +## Dependencies + +```toml +[workspace.dependencies] +rusqlite = { 
version = "0.38", features = ["bundled"] } +shell-words = "1" +ratatui = "0.30" +crossterm = "0.28" +chrono = { version = "0.4", features = ["serde"] } +clap = { version = "4", features = ["derive"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +serde_yaml = "0.9" +anyhow = "1" +dirs = "6" +``` + +## Implementation Phases + +### Phase 1: Foundation (MVP) +1. Cargo workspace setup +2. Atuin DB reader (rusqlite, read-only) +3. Command abstractor (shell-words + custom normalizer) +4. PrefixSpan implementation +5. Own SQLite storage for discovered workflows +6. `wfm scan` and `wfm list` commands + +### Phase 2: Output +7. `wfm show <id>` with detailed view +8. `wfm export justfile` — generate Justfile recipes +9. `wfm export json` / `wfm export yaml` +10. Scoring and ranking + +### Phase 3: Interactive +11. TUI dashboard (ratatui) +12. `wfm suggest` command +13. zsh plugin with precmd hook + suggestion widget +14. `wfm forget` for dismissing irrelevant patterns + +### Phase 4: Enrichment +15. BIDE closed pattern filter (reduce noise) +16. Pattern clustering (edit distance dedup) +17. Project/git context inference +18. Optional LLM integration (`wfm describe`) diff --git a/docs/research-atuin.md b/docs/research-atuin.md new file mode 100644 index 0000000..0daf51d --- /dev/null +++ b/docs/research-atuin.md @@ -0,0 +1,98 @@ +# Research: Atuin Shell History + +## What is Atuin + +[Atuin](https://github.com/atuinsh/atuin) is a shell history replacement written in Rust. Instead of the flat `~/.zsh_history` text file, it stores commands in SQLite with rich metadata. ~25k GitHub stars, 200k+ users, 220M+ synced history entries. 
+ +- Shells: zsh, bash, fish, nushell, xonsh (PowerShell tier-2) +- Architecture: Rust binary + SQLite at `~/.local/share/atuin/history.db` +- Hooks into shell via `preexec`/`precmd` (zsh) or equivalent +- Doesn't replace existing history file — runs alongside it +- Optional e2e encrypted cross-machine sync (self-hostable or cloud) + +## SQLite Schema + +Database location: `~/.local/share/atuin/history.db` (configurable via `db_path` in `~/.config/atuin/config.toml`, respects `$XDG_DATA_HOME`). + +Database engine: SQLite with WAL journal mode and normal synchronous setting. + +### Tables + +```sql +CREATE TABLE IF NOT EXISTS history ( + id TEXT PRIMARY KEY, + timestamp INTEGER NOT NULL, -- Unix nanoseconds (i64) + duration INTEGER NOT NULL, -- Command duration in nanoseconds + exit INTEGER NOT NULL, -- Exit code + command TEXT NOT NULL, -- The shell command string + cwd TEXT NOT NULL, -- Working directory + session TEXT NOT NULL, -- Terminal session ID + hostname TEXT NOT NULL, -- Machine hostname + deleted_at INTEGER, -- Soft-delete timestamp (nullable) + UNIQUE(timestamp, cwd, command) +); +``` + +### Indexes + +```sql +CREATE INDEX IF NOT EXISTS idx_history_timestamp ON history(timestamp); +CREATE INDEX IF NOT EXISTS idx_history_command ON history(command); +CREATE INDEX IF NOT EXISTS idx_history_command_timestamp ON history(command, timestamp); +``` + +### Rust History struct + +| Field | Rust Type | SQLite Column | +|-------|-----------|---------------| +| `id` | `String` | `id TEXT` | +| `timestamp` | `chrono::DateTime<Utc>` | `timestamp INTEGER` (nanoseconds) | +| `duration` | `i64` | `duration INTEGER` | +| `exit` | `i64` | `exit INTEGER` | +| `command` | `String` | `command TEXT` | +| `cwd` | `String` | `cwd TEXT` | +| `session` | `String` | `session TEXT` | +| `hostname` | `String` | `hostname TEXT` | +| `deleted_at` | `Option<DateTime<Utc>>` | `deleted_at INTEGER` (nullable) | + +### Migrations history + +5 migrations in `crates/atuin-client/migrations/`: +1. 
`20210422143411_create_history.sql` — initial table + indexes +2. `20220505083406_create-events.sql` — events table (later dropped) +3. `20220806155627_interactive_search_index.sql` — compound index on (command, timestamp) +4. `20230315220114_drop-events.sql` — dropped events table +5. `20230319185725_deleted_at.sql` — added soft-delete column + +### Key observations for workflow mining + +- **Session grouping**: The `session` column groups commands by terminal session — essential for sequential pattern mining. +- **Timestamp ordering**: Nanosecond precision allows precise ordering within sessions. +- **CWD tracking**: `cwd` provides project/directory context. +- **No pipeline decomposition**: `cat foo | grep bar` is stored as a single `command` string. +- **UNIQUE constraint** on `(timestamp, cwd, command)` prevents exact duplicates. +- **Existing indexes** on `command` and `(command, timestamp)` help frequency analysis. +- **Record store** is a separate database (`record.db`) with encrypted sync data — irrelevant for local mining. +- **WAL mode**: Safe for concurrent read access from a separate process. + +### Recommended base query + +```sql +SELECT command, timestamp, cwd, session, duration, exit +FROM history +WHERE deleted_at IS NULL +ORDER BY session, timestamp; +``` + +## Atuin Desktop + +Separate product (open beta, recently open-sourced Apache 2.0). An executable runbook editor with CRDT-based collaboration. Think "Notion but the code blocks run." Autocomplete draws from synced history. Manual authoring — not automatic extraction. 
+ +## Source references + +- [Atuin GitHub](https://github.com/atuinsh/atuin) +- [Atuin website](https://atuin.sh) +- [database.rs](https://github.com/atuinsh/atuin/blob/main/crates/atuin-client/src/database.rs) +- [Migrations directory](https://github.com/atuinsh/atuin/tree/main/crates/atuin-client/migrations) +- [Atuin config docs](https://docs.atuin.sh/cli/configuration/config/) +- [Atuin Desktop blog post](https://blog.atuin.sh/atuin-desktop-runbooks-that-run/) diff --git a/docs/research-existing-tools.md b/docs/research-existing-tools.md new file mode 100644 index 0000000..da44661 --- /dev/null +++ b/docs/research-existing-tools.md @@ -0,0 +1,114 @@ +# Research: Existing Shell History & Workflow Tools + +## Shell History Tools with Context + +### Atuin +See [research-atuin.md](research-atuin.md) for detailed analysis. + +### McFly +- [GitHub](https://github.com/cantino/mcfly) +- Replaces Ctrl+R with full-screen search powered by a small neural network +- Prioritizes suggestions based on: cwd, recently executed commands, frequency, recency, prior selections, exit status +- SQLite storage +- **Closest thing to context-aware command prediction in production**, but predicts single commands, not sequences + +### hishtory +- [GitHub](https://github.com/ddworken/hishtory) +- History with context (hostname, cwd, timestamp, runtime, exit code) in SQLite +- Synced, e2e encrypted +- Custom columns, ChatGPT integration (prefix query with `?`) +- No sequence detection + +### RESH +- [GitHub](https://github.com/curusarn/resh) +- Context-based shell history for zsh and bash +- Records directory, time, exit status +- Filters to current directory by default +- No pattern mining + +### BSH +- [GitHub](https://github.com/karthikeyjoshi/bsh) +- Git-aware, predictive terminal history +- Client-daemon architecture (C++), uses libgit2 for branch resolution +- Renders "Top 5" relevance list via ZLE, <5ms latency +- Can filter by active git branch +- No multi-step sequence detection + 
+### ContextRecall +- [GitHub](https://github.com/prklm10/contextrecall) +- Rust CLI giving every directory its own isolated shell history +- Auto-detects project roots (`.git`, `Cargo.toml`, `package.json`) +- Per-project SQLite databases + +### Historai +- [GitHub](https://github.com/sanspareilsmyn/historai) +- LLM-powered Go CLI using Google Gemini +- Semantic search over history (`find`) and AI-generated commands using history as context (`suggest`) + +## Manual Workflow / Recipe Tools + +### Atuin Desktop (Runbooks) +- [Blog post](https://blog.atuin.sh/atuin-desktop-runbooks-that-run/) +- Executable runbook editor, CRDT-powered, local-first +- Chains shell commands, DB queries, HTTP requests +- Jinja templating, autocomplete from history +- [Open-sourced](https://blog.atuin.sh/atuin-desktop-open-source/) under Apache 2.0 +- **Manual authoring** — not automatic extraction + +### Warp Terminal Workflows +- [Docs](https://docs.warp.dev/features/warp-drive/workflows) +- Parameterized, reusable command templates +- Community-contributed [workflow repo](https://github.com/warpdotdev/workflows) with 145+ templates +- **Manually authored**, not auto-detected + +### Just / Task / Make +- [Just](https://github.com/casey/just) — command runner with recipe parameters +- [Task](https://taskfile.dev/) — YAML-based task runner with checksum dependencies +- [GNU Make](https://www.gnu.org/software/make/) — classic build tool +- All require **manual authoring** + +### The gap between manual and automatic + +1. **Discovery latency** — You only create a recipe after repeating the workflow many times +2. **Context blindness** — Manual recipes don't know about directory, git branch, or project state +3. **Composition blindness** — Multi-tool workflows (git + docker + kubectl) rarely captured +4. **Personal vs. 
team** — Individual developer workflows never feel "important enough" for a Makefile + +## Single-Command Suggestion/Autocomplete + +- [zsh-autosuggestions](https://github.com/zsh-users/zsh-autosuggestions) — fish-like inline suggestions from history +- [fzf](https://github.com/junegunn/fzf) — fuzzy finder, commonly bound to Ctrl+R +- [HSTR](https://github.com/dvorka/hstr) — TUI suggest box for browsing/searching history + +## Pattern Analysis (Frequency Only) + +- [lazy](https://github.com/AndrewRPorter/lazy) — analyzes history for most frequent commands, suggests aliases +- [k8au-shell-analyzer](https://github.com/ksauraj/k8au-shell-analyzer) — TUI for usage pattern insights + +## Contextual Signals Used Across Tools + +| Signal | Used By | +|--------|---------| +| Current working directory | McFly, RESH, BSH, ContextRecall, Atuin | +| Git branch | BSH | +| Recently executed commands | McFly | +| Session ID | Atuin, hishtory | +| Exit code | McFly, Atuin, hishtory, BSH | +| Command duration | Atuin, hishtory | +| Hostname | Atuin, hishtory | +| Time of day / recency | McFly, Atuin | +| Prior selection (user feedback) | McFly | + +**No tool combines these contextual signals with sequence mining.** + +## Summary + +| Capability | State of the Art | Gap | +|-----------|-----------------|-----| +| Rich shell history storage | **Solved** (Atuin, hishtory, BSH) | None | +| Context-aware single command suggestion | **Solved** (McFly neural net) | None | +| Manual workflow/recipe authoring | **Solved** (Atuin Desktop, Warp, Just, Task) | None | +| Automatic multi-step recipe extraction | **Research only** (ShRec, 2024) | **Wide open** | +| Shell session summarization (security) | **Exists** (Elastic, RACONTEUR) | Not applied to dev productivity | +| Contextual sequence recommendation | **Theoretical** | No production tool | +| Process mining on shell history | **Never attempted** | Completely unexplored | diff --git a/docs/research-pattern-mining.md 
b/docs/research-pattern-mining.md new file mode 100644 index 0000000..248045d --- /dev/null +++ b/docs/research-pattern-mining.md @@ -0,0 +1,103 @@ +# Research: Sequential Pattern Mining for Shell History + +## The Key Paper: ShRec (Huawei, 2024) + +**[ShRec: A SRE Behaviour Knowledge Graph Model for Shell Command Recommendations](https://arxiv.org/abs/2408.05592)** — IEEE, August 2024 + +The most directly relevant academic work. Proves that sequential pattern mining on real shell history produces useful results. + +### Approach + +1. Models shell sessions as transactions — each command is an item, the temporal sequence of commands in a session is a transaction +2. Applies sequential pattern mining (frequent sub-sequence extraction) across all sessions +3. Clusters similar sequences using distance metrics and K-means to reduce redundancy +4. Builds a knowledge graph for recommendation + +### Results + +- Extracted 3,997 sequences ranging from 2 to 14 commands +- Support values from 2 to 101 +- Sequences were executed by up to 43 different users +- SREs confirmed the mined sequences matched real operational workflows + +## Sequential Pattern Mining Algorithms + +All implemented in the [SPMF library](https://www.philippe-fournier-viger.com/spmf/index.php?link=algorithms.php) (Java, open source): + +| Algorithm | Approach | Shell History Applicability | +|-----------|----------|-----------------------------| +| **PrefixSpan** | Pattern-growth via prefix-projected databases | **Best fit** — fast, handles variable-length sequences, no candidate generation | +| **GSP** | Apriori-like, level-wise candidate generation | Good for small histories; scales poorly | +| **SPADE** | Vertical ID-list intersection | Viable alternative to PrefixSpan | +| **SPAM** | Bitmap representation | Memory-efficient for dense data | +| **BIDE** | Closed sequential pattern mining | Eliminates redundant patterns — critical for usability | +| **CloSpan/ClaSP** | Closed/maximal patterns | Reduces 
output size | + +**Recommendation:** PrefixSpan as the primary algorithm. ~200 lines of core logic. [Per the original paper](https://hanj.cs.illinois.edu/pdf/span01.pdf), it consistently outperforms GSP, FreeSpan, and SPADE. + +### Reference implementation + +[PrefixSpan-py](https://github.com/chuanconggao/PrefixSpan-py) — started as 15 lines of code. Includes BIDE (closed patterns) and FEAT (generator patterns). Straightforward to port to Rust. + +## Data Model for Shell History + +- **Item**: A single command (abstracted — e.g., `git commit -m *` rather than the literal message) +- **Transaction/Sequence**: All commands in a session (defined by session ID, or by time-gap heuristic) +- **Database**: All sessions across time + +### Key considerations + +- **Command abstraction is critical.** Literal commands have too much variance. Must normalize arguments while preserving command structure. +- **Session segmentation matters.** Atuin provides session IDs. Without them, use time-gap threshold (>30 min gap = new session). +- **Support threshold needs tuning.** ShRec used min_support=2. For personal use, even 2 surfaces useful patterns. +- **Gap constraints** — want "relaxed" sequential patterns allowing intervening commands (e.g., `git add -> [anything] -> git commit -> [anything] -> git push`). 
+ +## Simpler Alternatives + +### N-gram approaches + +- Bigrams/trigrams of commands: fast, easy, limited to fixed-length contiguous sequences +- Variable-length n-grams with recency decay +- Markov chains: good for "next command" prediction, not full workflow extraction + +### Process Mining (unexplored opportunity) + +[PM4Py](https://github.com/process-intelligence-solutions/pm4py) implements industrial-strength process discovery: + +- **Alpha Miner** — discovers causal relations from directly-follows semantics +- **Heuristics Miner** — handles noise well +- **Inductive Miner** — guaranteed-sound block-structured models + +Maps directly to shell history: Case ID = session, Activity = command, Timestamp = timestamp. Could produce visual workflow models. **Nobody has applied process mining to shell history.** + +Reference: [Workflow Mining: Discovering Process Models from Event Logs](https://www.vdaalst.com/publications/p245.pdf) (van der Aalst et al., 2004) + +## Rust Ecosystem for Pattern Mining + +### No mature SPM crate exists + +The [sequential-pattern-mining GitHub topic](https://github.com/topics/sequential-pattern-mining) lists 15 repos — zero in Rust. + +| Crate | What it does | Verdict | +|-------|-------------|---------| +| `rust-rule-miner` | Association rule mining (Apriori) | Not sequential pattern mining | + +### Recommended approach: Custom PrefixSpan implementation + +Port from PrefixSpan-py. Estimated ~300-500 lines of Rust. The algorithm is recursive and maps cleanly to Rust's type system (`Vec<Vec<Item>>`). 
+ +## Related Academic Work + +| Paper | Venue | Relevance | +|-------|-------|-----------| +| [ShRec](https://arxiv.org/abs/2408.05592) | IEEE 2024 | Direct precedent — SPM on shell history | +| [PrefixSpan](https://hanj.cs.illinois.edu/pdf/span01.pdf) | ICDE 2001 | The algorithm we'll implement | +| [RACONTEUR](https://arxiv.org/abs/2409.02074) | NDSS 2025 | LLM-powered shell command explainer with RAG | +| [SASH](https://sigops.org/s/conferences/hotos/2025/papers/hotos25-364.pdf) | HotOS 2025 | Static analysis framework for shell, LLM-assisted spec generation | +| [AgentSpec](https://arxiv.org/abs/2503.18666) | ICSE 2026 | DSL for runtime safety constraints on LLM agents | +| Cybersecurity training dataset | [ScienceDirect](https://www.sciencedirect.com/science/article/pii/S2352340921006806) | SPM on shell commands from 175 participants | +| [Workflow Mining](https://www.vdaalst.com/publications/p245.pdf) | van der Aalst 2004 | Process mining foundations | + +## Elastic Security Labs — Session Summarization + +[Using LLMs to summarize user sessions](https://www.elastic.co/security-labs/using-llms-to-summarize-user-sessions) — used GPT-4 to summarize shell sessions for security analysts. Proved the approach works at scale. [Follow-up](https://www.elastic.co/security-labs/using-llms-and-esre-to-find-similar-user-sessions) used semantic retrieval to find similar sessions. Applied to security — nobody has done this for developer productivity. 
diff --git a/docs/research-rust-ecosystem.md b/docs/research-rust-ecosystem.md new file mode 100644 index 0000000..c99ec5e --- /dev/null +++ b/docs/research-rust-ecosystem.md @@ -0,0 +1,118 @@ +# Research: Rust Ecosystem & Dependencies + +## SQLite Access: rusqlite + +**Winner: [rusqlite](https://docs.rs/rusqlite/0.38) v0.38** + +| Feature | rusqlite | sqlx | +|---------|----------|------| +| Async | No (sync) | Yes | +| SQLite-only | Yes | No (multi-DB) | +| Bundled SQLite | Yes (feature flag) | No | +| Read-only support | Yes (`SQLITE_OPEN_READ_ONLY`) | Yes | + +Why rusqlite: +- Synchronous is fine for a CLI reading a local file +- `Connection::open_with_flags(path, OpenFlags::SQLITE_OPEN_READ_ONLY)` ensures safety +- `features = ["bundled"]` includes SQLite 3.49.2 — no system dependency +- Simpler dependency tree than sqlx +- WAL-safe: reading an Atuin DB in WAL mode from a separate process is safe + +```rust +use rusqlite::{Connection, OpenFlags}; + +let db = Connection::open_with_flags( + "~/.local/share/atuin/history.db", + OpenFlags::SQLITE_OPEN_READ_ONLY | OpenFlags::SQLITE_OPEN_NO_MUTEX, +)?; +``` + +## Shell Command Parsing + +### Level 1: Word splitting — shell-words + +[shell-words](https://docs.rs/shell-words) v1 — POSIX shell word splitting. + +```rust +shell_words::split("git commit -m \"fix typo\"") +// → ["git", "commit", "-m", "fix typo"] +``` + +Simple, correct, well-maintained. Exactly what we need for command abstraction. 
+ +### Level 2: Full AST parsing (optional) + +| Crate | Version | Description | Maturity | +|-------|---------|-------------|----------| +| [brush-parser](https://docs.rs/brush-parser) | 0.3.0 | Full POSIX/bash tokenizer + parser with AST | Active, part of brush-shell | +| [conch-parser](https://github.com/ipetkov/conch-parser) | 0.1.1 | Shell parser with AST builder | Stale | +| [mystsh](https://crates.io/crates/mystsh) | Early | POSIX/bash parser | Newer, active | + +### Level 3: Variable expansion + +[shellexpand](https://docs.rs/shellexpand) v3.1.1 — tilde and `$VAR` expansion. 10M+ downloads. + +## TUI: ratatui + +**Winner: [ratatui](https://github.com/ratatui/ratatui) v0.30** + +| Library | Stars | Status | +|---------|-------|--------| +| **ratatui** | 18.6k | Active, community standard | +| cursive | ~3.8k | Maintained, less active | +| tui-rs | ~10k | **Unmaintained** (ratatui is the fork) | + +Why ratatui: +- Immediate-mode rendering (you own the render loop) +- Widgets: Block, Paragraph, List, Table, Chart, BarChart, Tabs, etc. +- Backends: crossterm (default), termion, termwiz +- 100+ third-party widgets via [awesome-ratatui](https://github.com/ratatui/awesome-ratatui) +- 60+ FPS with complex layouts +- Since 0.30.0: modular architecture (ratatui-core, ratatui-widgets, ratatui-crossterm) + +Relevant widgets for workflow dashboard: +- `Table` — displaying discovered patterns +- `List` — navigating workflows +- `BarChart` / `Sparkline` — frequency visualization +- `Tabs` — switching views + +## CLI Framework: clap + +[clap](https://docs.rs/clap/4) v4 with derive feature — standard, ergonomic. + +## Sequential Pattern Mining + +**No mature Rust crate exists.** Zero repos in Rust on the [sequential-pattern-mining GitHub topic](https://github.com/topics/sequential-pattern-mining). + +Recommended: Implement PrefixSpan directly, porting from [PrefixSpan-py](https://github.com/chuanconggao/PrefixSpan-py). ~300-500 lines of Rust. 
+ +## Justfile Generation + +No Rust crate for generating Justfiles. Format is simple enough for string formatting. + +Key syntax rules: +- Recipe bodies use spaces (with `set indent`) or tabs +- Recipe names: lowercase, hyphens allowed +- Variables: `name := "value"`, interpolated with `{{name}}` +- Dependencies: `test: build lint` +- Parameters: `deploy target="default":` +- Shebang recipes: first line `#!` +- `@` prefix suppresses echo +- Validate with `just --fmt --check --unstable -f <file>` + +## Recommended Cargo.toml + +```toml +[workspace.dependencies] +rusqlite = { version = "0.38", features = ["bundled"] } +shell-words = "1" +ratatui = "0.30" +crossterm = "0.28" +chrono = { version = "0.4", features = ["serde"] } +clap = { version = "4", features = ["derive"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +serde_yaml = "0.9" +anyhow = "1" +dirs = "6" +``` diff --git a/docs/research-zsh-integration.md b/docs/research-zsh-integration.md new file mode 100644 index 0000000..563e5c5 --- /dev/null +++ b/docs/research-zsh-integration.md @@ -0,0 +1,113 @@ +# Research: zsh Plugin Integration + +## Plugin Structure (Universal Minimum) + +``` +my-plugin/ + my-plugin.plugin.zsh # Main entry point (required, name must match dir) + functions/ # Optional: autoloaded functions + _my-plugin # Optional: completion function + bin/ # Optional: executables added to $PATH +``` + +## Plugin Manager Compatibility + +| Manager | Loading mechanism | Install method | +|---------|-------------------|----------------| +| oh-my-zsh | Sources `$ZSH_CUSTOM/plugins/<name>/<name>.plugin.zsh` | `git clone` into plugins dir, add to `plugins=()` | +| zinit | `zinit light user/repo` | Configured in `.zshrc` | +| antigen | `antigen bundle user/repo` | Configured in `.zshrc` | +| antidote | Listed in `.zsh_plugins.txt` | Listed in text file | +| sheldon | TOML config | Configured in `sheldon.toml` | +| Manual | `source /path/to/plugin.zsh` | User adds `source` line | + +All managers 
expect a Git repo with a `<name>.plugin.zsh` file. Host on GitHub and it works everywhere. + +## Zsh Plugin Standard Best Practices + +From the [Zsh Plugin Standard](https://zdharma-continuum.github.io/Zsh-100-Commits-Club/Zsh-Plugin-Standard.html): + +1. **Standardized `$0` handling** for reliable plugin directory detection: + ```zsh + 0="${ZERO:-${${0:#$ZSH_ARGZERO}:-${(%):-%N}}}" + 0="${${(M)0:#/*}:-$PWD/$0}" + ``` + +2. Use `functions/` subdirectory for autoloaded functions and completions (managers add to `$fpath`) +3. Use `bin/` subdirectory for executables (managers add to `$PATH`) +4. Provide an unload function named `{pluginname}_plugin_unload` +5. Use `add-zsh-hook` for hooks rather than defining `precmd`/`preexec` directly +6. Completion files named `_<command>` in `functions/` + +## ZLE (Zsh Line Editor) Mechanisms + +### Key variables and commands +- `$BUFFER` — current command line being edited +- `$LBUFFER` / `$RBUFFER` — left/right of cursor +- `$CURSOR` — cursor position +- `zle -I` — invalidate display (allows printing mid-edit) +- `zle reset-prompt` — redraw prompt after external output +- `zle -R "message"` — display message in status area + +### Widget pattern for LLM/suggestion integration + +```zsh +_my_widget() { + zle -I + local result + result=$(my-binary --some-args 2>/dev/null) + if [[ -n "$result" ]]; then + BUFFER="$result" + CURSOR=${#BUFFER} + fi + zle reset-prompt +} +zle -N _my_widget +bindkey '^X^S' _my_widget +``` + +### Hook functions +- `preexec` — fires after Enter but before execution, receives command as `$1` +- `precmd` — fires after command finishes, before prompt draws +- `zshaddhistory` — fires when a command is added to history +- `chpwd` — fires on directory change + +Registration: +```zsh +autoload -Uz add-zsh-hook +add-zsh-hook preexec my_preexec_fn +add-zsh-hook precmd my_precmd_fn +``` + +## Distribution Strategy for a Rust Binary + zsh Plugin + +The plugin is a thin shell wrapper around the Rust binary: + +``` +wfm-zsh/ + wfm.plugin.zsh #
Check binary in PATH, set up hooks, keybindings + functions/ + _wfm # Completion function +``` + +The `.plugin.zsh` file: +1. Checks if the binary is in `$PATH` +2. Sets up shell hooks (`precmd`/`preexec` if ambient mode) +3. Adds completions to `$fpath` +4. Defines ZLE widgets and keybindings +5. Defines convenience aliases/functions + +Distribution: +- Host on GitHub as a standalone repo (works with all managers) +- Provide manual install instructions (`source` line) +- Binary distributed separately via `cargo install` or Homebrew +- Optionally submit to oh-my-zsh as a community plugin + +## References + +- [Zsh Plugin Standard](https://zdharma-continuum.github.io/Zsh-100-Commits-Club/Zsh-Plugin-Standard.html) +- [ZLE documentation](https://zsh.sourceforge.io/Doc/Release/Zsh-Line-Editor.html) +- [ZLE custom widgets guide](https://sgeb.io/posts/zsh-zle-custom-widgets/) +- [oh-my-zsh customization wiki](https://github.com/ohmyzsh/ohmyzsh/wiki/Customization) +- [zinit GitHub](https://github.com/zdharma-continuum/zinit) +- [zsh hooks](https://github.com/rothgar/mastering-zsh/blob/master/docs/config/hooks.md)