Skip to content

Commit 396ca26

Browse files
committed
Start experimenting with indexing history
1 parent 53f415d commit 396ca26

File tree

10 files changed

+328
-1
lines changed

10 files changed

+328
-1
lines changed

Cargo.lock

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ gitbutler-forge = { path = "crates/gitbutler-forge" }
6868
gitbutler-hunk-dependency = { path = "crates/gitbutler-hunk-dependency" }
6969
but-settings = { path = "crates/but-settings" }
7070
gitbutler-workspace = { path = "crates/gitbutler-workspace" }
71+
but-inspection = { path = "crates/but-inspection" }
7172
but = { path = "crates/but" }
7273
but-testsupport = { path = "crates/but-testsupport" }
7374
but-rebase = { path = "crates/but-rebase" }

crates/but-inspection/Cargo.toml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
[package]
2+
name = "but-inspection"
3+
version = "0.0.0"
4+
edition = "2024"
5+
authors = ["GitButler <[email protected]>"]
6+
publish = false
7+
8+
[lib]
9+
doctest = false
10+
11+
[features]
12+
# Set when building in test-mode to enable features that help with generating repeatable tests.
13+
testing = []
14+
15+
[dependencies]
16+
serde = { workspace = true, features = ["std"] }
17+
bstr.workspace = true
18+
tracing.workspace = true
19+
anyhow = "1.0.98"
20+
gix = { workspace = true, features = [
21+
"dirwalk",
22+
"credentials",
23+
"parallel",
24+
"serde",
25+
"status",
26+
] }
27+
gitbutler-serde.workspace = true
28+
gitbutler-error.workspace = true
29+
gitbutler-command-context.workspace = true
30+
uuid.workspace = true
31+
toml.workspace = true
32+
33+
[dev-dependencies]

crates/but-inspection/src/db.rs

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
//! This is due to be replaced with a real vector database. I just got bored
2+
//! with trying to get them to work.
3+
use std::{
4+
collections::HashMap,
5+
hash::{DefaultHasher, Hash as _, Hasher as _},
6+
path::PathBuf,
7+
};
8+
9+
use anyhow::{Result, bail};
10+
use gitbutler_command_context::CommandContext;
11+
use serde::{Deserialize, Serialize};
12+
13+
#[derive(Serialize, Deserialize, Clone)]
14+
pub struct Hunk {
15+
/// The sha of the commit
16+
#[serde(with = "gitbutler_serde::object_id")]
17+
pub sha: gix::ObjectId,
18+
/// Header
19+
pub header: String,
20+
pub vector: Vec<f32>,
21+
}
22+
23+
impl Hunk {
24+
// We should only ever have one entry per commit & hunk hearder
25+
pub fn key(&self) -> u64 {
26+
let mut hasher = DefaultHasher::new();
27+
(self.sha, &self.header).hash(&mut hasher);
28+
hasher.finish()
29+
}
30+
}
31+
32+
#[derive(Serialize, Deserialize, Clone)]
33+
/// Used to denote a commit has been processed
34+
pub struct Commit {
35+
/// The sha of the commit
36+
#[serde(with = "gitbutler_serde::object_id")]
37+
pub sha: gix::ObjectId,
38+
}
39+
40+
#[derive(Serialize, Deserialize)]
41+
pub struct Db {
42+
pub hunks: Vec<Hunk>,
43+
pub commits: Vec<Commit>,
44+
}
45+
46+
/// Handle to the file-backed database. It only remembers where the file
/// lives; every operation re-reads the file from disk.
pub struct DbHandle {
    /// Path of the file the database is read from and written to.
    path: PathBuf,
}
49+
50+
// TODO: Replace with real vector database
51+
impl DbHandle {
52+
pub fn new(ctx: &CommandContext) -> Self {
53+
Self {
54+
path: ctx.project().gb_dir().join("inspection.toml"),
55+
}
56+
}
57+
58+
pub fn read(&self) -> Result<Db> {
59+
if std::fs::exists(&self.path)? {
60+
let content = std::fs::read_to_string(&self.path)?;
61+
let content: Db = toml::from_str(&content)?;
62+
Ok(content)
63+
} else {
64+
Ok(Db {
65+
hunks: vec![],
66+
commits: vec![],
67+
})
68+
}
69+
}
70+
71+
fn write(&self, db: &Db) -> Result<()> {
72+
let content = toml::to_string(db)?;
73+
std::fs::write(&self.path, content)?;
74+
Ok(())
75+
}
76+
77+
pub fn upsert_many_hunks(&self, entries: &[Hunk]) -> Result<Vec<Hunk>> {
78+
let mut db = self.read()?;
79+
let mut map = db
80+
.hunks
81+
.into_iter()
82+
.map(|e| (e.key(), e))
83+
.collect::<HashMap<u64, Hunk>>();
84+
85+
for e in entries {
86+
map.insert(e.key(), e.clone());
87+
}
88+
89+
db.hunks = map.into_values().collect::<Vec<_>>();
90+
91+
self.upsert_many_commits(
92+
&entries
93+
.iter()
94+
.map(|h| Commit { sha: h.sha })
95+
.collect::<Vec<Commit>>(),
96+
)?;
97+
98+
self.write(&db)?;
99+
100+
Ok(db.hunks)
101+
}
102+
103+
pub fn upsert_many_commits(&self, entries: &[Commit]) -> Result<Vec<Commit>> {
104+
let mut db = self.read()?;
105+
let mut map = db
106+
.commits
107+
.into_iter()
108+
.map(|e| (e.sha, e))
109+
.collect::<HashMap<gix::ObjectId, Commit>>();
110+
111+
for e in entries {
112+
map.insert(e.sha, e.clone());
113+
}
114+
115+
db.commits = map.into_values().collect::<Vec<_>>();
116+
117+
self.write(&db)?;
118+
119+
Ok(db.commits)
120+
}
121+
122+
// TODO: Replace with real vector db search rather than a manual implementation.
123+
pub fn search_hunks(&self, term: Vec<f32>, cutoff: Option<usize>) -> Result<Vec<(Hunk, f32)>> {
124+
let db = self.read()?;
125+
126+
let mut with_distance = db
127+
.hunks
128+
.into_iter()
129+
.map(|i| {
130+
let distance = cosine_distance(&i.vector, &term)?;
131+
Ok((i, distance))
132+
})
133+
.collect::<Result<Vec<(Hunk, f32)>>>()?;
134+
135+
// Sort decending
136+
with_distance.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
137+
138+
if let Some(cutoff) = cutoff {
139+
Ok(with_distance.into_iter().take(cutoff).collect())
140+
} else {
141+
Ok(with_distance)
142+
}
143+
}
144+
}
145+
146+
fn cosine_distance(a: &[f32], b: &[f32]) -> Result<f32> {
147+
if a.len() != b.len() {
148+
bail!("Vectors MUST be the same length!")
149+
};
150+
151+
let dot: f32 = a.iter().zip(b).map(|(a, b)| a * b).sum();
152+
Ok(dot / a.len() as f32)
153+
}

crates/but-inspection/src/lib.rs

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
//! Hello there friend!
2+
3+
use anyhow::Result;
4+
use gitbutler_command_context::CommandContext;
5+
use serde::Serialize;
6+
7+
use crate::db::DbHandle;
8+
9+
mod db;
10+
11+
#[derive(Serialize, Debug)]
12+
pub struct RepositoryIndexStats {
13+
/// How many commits we have indexed
14+
commits_indexed: usize,
15+
/// How many commits are reachable
16+
total_commits: usize,
17+
/// How many reachable commits are indexed
18+
reachable_commits_indexed: usize,
19+
}
20+
pub async fn index_stats(ctx: &CommandContext) -> Result<RepositoryIndexStats> {
21+
let db_handle = DbHandle::new(ctx);
22+
let db = db_handle.read()?;
23+
let all_commits = commits_to_index(ctx)?;
24+
25+
let reachable_commits_indexed = db
26+
.commits
27+
.iter()
28+
.filter(|c| all_commits.contains(&c.sha))
29+
.count();
30+
31+
Ok(RepositoryIndexStats {
32+
commits_indexed: db.commits.len(),
33+
total_commits: all_commits.len(),
34+
reachable_commits_indexed,
35+
})
36+
}
37+
38+
#[derive(Serialize, Debug)]
39+
pub struct EmbeddingsResult {}
40+
/// Placeholder for embedding generation.
///
/// # Panics
/// Always panics: the body is `todo!()`.
pub async fn generate_embeddings() -> Result<EmbeddingsResult> {
    todo!()
}
43+
44+
/// Lists all commits referenced by references
45+
fn commits_to_index(ctx: &CommandContext) -> Result<Vec<gix::ObjectId>> {
46+
let repo = ctx.gix_repo()?;
47+
48+
let all_references = repo
49+
.references()?
50+
.all()?
51+
.map(|r| {
52+
let id = r.map_err(|_| anyhow::anyhow!("Ahh"))?.try_id();
53+
if let Some(id) = id {
54+
if id.object()?.kind == gix::object::Kind::Commit {
55+
Ok(Some(id.detach()))
56+
} else {
57+
Ok(None)
58+
}
59+
} else {
60+
Ok(None)
61+
}
62+
})
63+
.filter_map(Result::transpose)
64+
.collect::<Result<Vec<gix::ObjectId>>>()?;
65+
66+
let commits = repo.rev_walk(all_references).all()?;
67+
68+
commits
69+
.into_iter()
70+
.map(|c| Ok(c?.id))
71+
.collect::<Result<Vec<_>>>()
72+
}

crates/but/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,6 @@ but-action.workspace = true
2929
but-graph.workspace = true
3030
but-workspace.workspace = true
3131
but-settings.workspace = true
32+
but-inspection.workspace = true
3233
gitbutler-command-context.workspace = true
3334
serde_json = "1.0.140"

crates/but/src/args.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,18 @@ pub enum Subcommands {
3737
#[clap(long, short = 's', default_value_t = 10)]
3838
page_size: i64,
3939
},
40+
/// The family of but inspection commands
41+
BetaInspect(Inspect),
42+
}
43+
44+
#[derive(Debug, clap::Parser)]
45+
pub struct Inspect {
46+
#[clap(subcommand)]
47+
pub cmd: InspectSubcommands,
48+
}
49+
50+
#[derive(Debug, clap::Subcommand)]
51+
pub enum InspectSubcommands {
52+
/// Get the current index status
53+
Status,
4054
}

crates/but/src/command/inspect.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
use std::path::Path;
2+
3+
use super::print;
4+
use but_settings::AppSettings;
5+
use gitbutler_command_context::CommandContext;
6+
use gitbutler_project::Project;
7+
8+
pub fn status(repo_path: &Path, json: bool) -> anyhow::Result<()> {
9+
let repo_path = repo_path.to_owned();
10+
11+
std::thread::spawn(move || {
12+
tokio::runtime::Runtime::new().map(|r| r.block_on(status_inner(&repo_path, json)))
13+
})
14+
.join()
15+
.unwrap()??;
16+
17+
Ok(())
18+
}
19+
20+
async fn status_inner(repo_path: &Path, json: bool) -> anyhow::Result<()> {
21+
let project = Project::from_path(repo_path).expect("Failed to create project from path");
22+
let ctx = CommandContext::open(&project, AppSettings::default())?;
23+
24+
let outcome = but_inspection::index_stats(&ctx).await?;
25+
26+
print(&outcome, json)
27+
}

crates/but/src/command/mod.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ use gitbutler_command_context::CommandContext;
77
use gitbutler_project::Project;
88
use serde::Serialize;
99

10+
pub(crate) mod inspect;
11+
1012
pub(crate) fn handle_changes(
1113
repo_path: &Path,
1214
json: bool,
@@ -40,7 +42,7 @@ pub(crate) fn list_actions(
4042
print(&response, json)
4143
}
4244

43-
fn print<T>(this: &T, json: bool) -> anyhow::Result<()>
45+
pub(crate) fn print<T>(this: &T, json: bool) -> anyhow::Result<()>
4446
where
4547
T: ?Sized + Serialize + std::fmt::Debug,
4648
{

crates/but/src/main.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ use anyhow::Result;
22

33
mod args;
44
use args::Args;
5+
6+
use crate::args::Inspect;
57
mod command;
68
mod mcp;
79
mod mcp_internal;
@@ -20,5 +22,10 @@ async fn main() -> Result<()> {
2022
args::Subcommands::ListActions { page, page_size } => {
2123
command::list_actions(&args.current_dir, args.json, *page, *page_size)
2224
}
25+
args::Subcommands::BetaInspect(Inspect { cmd }) => match cmd {
26+
args::InspectSubcommands::Status => {
27+
command::inspect::status(&args.current_dir, args.json)
28+
}
29+
},
2330
}
2431
}

0 commit comments

Comments
 (0)