Skip to content

Commit 5d2cc64

Browse files
committed
Start experimenting with indexing history
1 parent 453891a commit 5d2cc64

File tree

12 files changed

+4577
-997
lines changed

12 files changed

+4577
-997
lines changed

Cargo.lock

Lines changed: 3904 additions & 995 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ tracing = "0.1.41"
3131
tracing-subscriber = "0.3.19"
3232
tempfile = "3.14"
3333
rand = "0.9.1"
34+
async-openai = "0.28.3"
35+
tiktoken-rs="0.7"
3436

3537
gitbutler-id = { path = "crates/gitbutler-id" }
3638
gitbutler-git = { path = "crates/gitbutler-git" }
@@ -68,6 +70,7 @@ gitbutler-forge = { path = "crates/gitbutler-forge" }
6870
gitbutler-hunk-dependency = { path = "crates/gitbutler-hunk-dependency" }
6971
but-settings = { path = "crates/but-settings" }
7072
gitbutler-workspace = { path = "crates/gitbutler-workspace" }
73+
but-inspection = { path = "crates/but-inspection" }
7174
but = { path = "crates/but" }
7275
but-testsupport = { path = "crates/but-testsupport" }
7376
but-rebase = { path = "crates/but-rebase" }

crates/but-action/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ test = false
1212
[dependencies]
1313
serde = { workspace = true, features = ["std"] }
1414
serde-error = "0.1.3"
15-
async-openai = "0.28.2"
15+
async-openai.workspace = true
1616
tokio = { workspace = true, features = ["rt-multi-thread", "io-std"] }
1717
schemars = "0.9.0"
1818
serde_json = "1.0.138"

crates/but-inspection/Cargo.toml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
[package]
2+
name = "but-inspection"
3+
version = "0.0.0"
4+
edition = "2024"
5+
authors = ["GitButler <gitbutler@gitbutler.com>"]
6+
publish = false
7+
8+
[lib]
9+
doctest = false
10+
11+
[features]
12+
# Set when building in test-mode to enable features that help with generating repeatable tests.
13+
testing = []
14+
15+
[dependencies]
16+
serde = { workspace = true, features = ["std"] }
17+
bstr.workspace = true
18+
tracing.workspace = true
19+
anyhow = "1.0.98"
20+
gix = { workspace = true, features = [
21+
"dirwalk",
22+
"credentials",
23+
"parallel",
24+
"serde",
25+
"status",
26+
] }
27+
async-openai.workspace = true
28+
tiktoken-rs.workspace = true
29+
gitbutler-serde.workspace = true
30+
gitbutler-error.workspace = true
31+
gitbutler-command-context.workspace = true
32+
uuid.workspace = true
33+
toml.workspace = true
34+
serde_json = "1.0.140"
35+
but-core.workspace = true
36+
lancedb = "0.20"
37+
# Should match what lancedb depends on
38+
arrow-array = "55.1"
39+
arrow-schema = "55.1"
40+
41+
[dev-dependencies]

crates/but-inspection/src/db.rs

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
//! This is due to be replaced with a real vector database. I just got bored
2+
//! with trying to get them to work.
3+
use std::{
4+
collections::HashMap,
5+
hash::{DefaultHasher, Hash, Hasher as _},
6+
path::PathBuf,
7+
};
8+
9+
use anyhow::{Result, bail};
10+
use bstr::BString;
11+
use gitbutler_command_context::CommandContext;
12+
use serde::{Deserialize, Serialize};
13+
use tracing::instrument;
14+
15+
#[derive(Serialize, Deserialize, Clone)]
16+
pub struct Hunk {
17+
/// The sha of the commit
18+
#[serde(with = "gitbutler_serde::object_id")]
19+
pub oid: gix::ObjectId,
20+
/// Header
21+
pub header: String,
22+
pub path: BString,
23+
pub previous_path: Option<BString>,
24+
pub vector: Vec<f32>,
25+
}
26+
27+
impl Hunk {
28+
// We should only ever have one entry per commit & hunk hearder
29+
pub fn key(&self) -> u64 {
30+
let mut hasher = DefaultHasher::new();
31+
(self.oid, &self.header).hash(&mut hasher);
32+
hasher.finish()
33+
}
34+
}
35+
36+
#[derive(Serialize, Deserialize, Clone)]
37+
/// Used to denote a commit has been processed
38+
pub struct Commit {
39+
/// The sha of the commit
40+
#[serde(with = "gitbutler_serde::object_id")]
41+
pub oid: gix::ObjectId,
42+
}
43+
44+
#[derive(Serialize, Deserialize)]
45+
pub struct Db {
46+
pub hunks: Vec<Hunk>,
47+
pub commits: Vec<Commit>,
48+
}
49+
50+
/// Handle to the on-disk store; it only remembers where the backing file lives.
pub struct DbHandle {
    /// Location of the JSON file holding the serialized `Db`.
    path: PathBuf,
}
53+
54+
// TODO: Replace with real vector database
55+
impl DbHandle {
56+
pub fn new(ctx: &CommandContext) -> Self {
57+
Self {
58+
path: ctx.project().gb_dir().join("inspection.json"),
59+
}
60+
}
61+
62+
#[instrument(skip_all)]
63+
pub fn read(&self) -> Result<Db> {
64+
if std::fs::exists(&self.path)? {
65+
let content = std::fs::read_to_string(&self.path)?;
66+
let content: Db = serde_json::from_str(&content)?;
67+
Ok(content)
68+
} else {
69+
Ok(Db {
70+
hunks: vec![],
71+
commits: vec![],
72+
})
73+
}
74+
}
75+
76+
#[instrument(skip_all)]
77+
fn write(&self, db: &Db) -> Result<()> {
78+
let content = serde_json::to_string(db)?;
79+
std::fs::create_dir_all(self.path.parent().unwrap())?;
80+
std::fs::write(&self.path, content)?;
81+
Ok(())
82+
}
83+
84+
#[instrument(skip_all)]
85+
pub fn upsert_many_hunks(&self, entries: &[Hunk]) -> Result<Vec<Hunk>> {
86+
let mut db = self.read()?;
87+
let mut map = db
88+
.hunks
89+
.into_iter()
90+
.map(|e| (e.key(), e))
91+
.collect::<HashMap<u64, Hunk>>();
92+
93+
for e in entries {
94+
map.insert(e.key(), e.clone());
95+
}
96+
97+
db.hunks = map.into_values().collect::<Vec<_>>();
98+
99+
self.upsert_many_commits(
100+
&entries
101+
.iter()
102+
.map(|h| Commit { oid: h.oid })
103+
.collect::<Vec<Commit>>(),
104+
)?;
105+
106+
self.write(&db)?;
107+
108+
Ok(db.hunks)
109+
}
110+
111+
#[instrument(skip_all)]
112+
pub fn upsert_many_commits(&self, entries: &[Commit]) -> Result<Vec<Commit>> {
113+
let mut db = self.read()?;
114+
let mut map = db
115+
.commits
116+
.into_iter()
117+
.map(|e| (e.oid, e))
118+
.collect::<HashMap<gix::ObjectId, Commit>>();
119+
120+
for e in entries {
121+
map.insert(e.oid, e.clone());
122+
}
123+
124+
db.commits = map.into_values().collect::<Vec<_>>();
125+
126+
self.write(&db)?;
127+
128+
Ok(db.commits)
129+
}
130+
131+
// TODO: Replace with real vector db search rather than a manual implementation.
132+
#[instrument(skip_all)]
133+
pub fn search_hunks(&self, term: Vec<f32>, cutoff: Option<usize>) -> Result<Vec<(Hunk, f32)>> {
134+
let db = self.read()?;
135+
136+
let mut with_distance = db
137+
.hunks
138+
.into_iter()
139+
.map(|i| {
140+
let distance = cosine_distance(&i.vector, &term)?;
141+
Ok((i, distance))
142+
})
143+
.collect::<Result<Vec<(Hunk, f32)>>>()?;
144+
145+
// Sort decending
146+
with_distance.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
147+
148+
if let Some(cutoff) = cutoff {
149+
Ok(with_distance.into_iter().take(cutoff).collect())
150+
} else {
151+
Ok(with_distance)
152+
}
153+
}
154+
}
155+
156+
fn cosine_distance(a: &[f32], b: &[f32]) -> Result<f32> {
157+
if a.len() != b.len() {
158+
bail!("Vectors MUST be the same length!")
159+
};
160+
161+
let dot: f32 = a.iter().zip(b).map(|(a, b)| a * b).sum();
162+
let am: f32 = a.iter().fold(0.0, |acc, a| acc + (a * a)).powf(0.5);
163+
let bm: f32 = b.iter().fold(0.0, |acc, a| acc + (a * a)).powf(0.5);
164+
Ok(dot / (am * bm))
165+
}

0 commit comments

Comments
 (0)