Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4,906 changes: 3,907 additions & 999 deletions Cargo.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ tracing = "0.1.41"
tracing-subscriber = "0.3.19"
tempfile = "3.14"
rand = "0.9.1"
async-openai = "0.28.3"
tiktoken-rs="0.7"

gitbutler-id = { path = "crates/gitbutler-id" }
gitbutler-git = { path = "crates/gitbutler-git" }
Expand Down Expand Up @@ -68,6 +70,7 @@ gitbutler-forge = { path = "crates/gitbutler-forge" }
gitbutler-hunk-dependency = { path = "crates/gitbutler-hunk-dependency" }
but-settings = { path = "crates/but-settings" }
gitbutler-workspace = { path = "crates/gitbutler-workspace" }
but-inspection = { path = "crates/but-inspection" }
but = { path = "crates/but" }
but-testsupport = { path = "crates/but-testsupport" }
but-rebase = { path = "crates/but-rebase" }
Expand Down
2 changes: 1 addition & 1 deletion crates/but-action/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ test = false
[dependencies]
serde = { workspace = true, features = ["std"] }
serde-error = "0.1.3"
async-openai = "0.28.2"
async-openai.workspace = true
tokio = { workspace = true, features = ["rt-multi-thread", "io-std"] }
schemars = "0.9.0"
serde_json = "1.0.138"
Expand Down
41 changes: 41 additions & 0 deletions crates/but-inspection/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
[package]
name = "but-inspection"
version = "0.0.0"
edition = "2024"
authors = ["GitButler <gitbutler@gitbutler.com>"]
publish = false

[lib]
doctest = false

[features]
# Set when building in test-mode to enable features that help with generating repeatable tests.
testing = []

[dependencies]
serde = { workspace = true, features = ["std"] }
bstr.workspace = true
tracing.workspace = true
anyhow = "1.0.98"
gix = { workspace = true, features = [
"dirwalk",
"credentials",
"parallel",
"serde",
"status",
] }
async-openai.workspace = true
tiktoken-rs.workspace = true
gitbutler-serde.workspace = true
gitbutler-error.workspace = true
gitbutler-command-context.workspace = true
uuid.workspace = true
toml.workspace = true
serde_json = "1.0.140"
but-core.workspace = true
lancedb = "0.20"
# Should match what lancedb depends on
arrow-array = "55.1"
arrow-schema = "55.1"

[dev-dependencies]
165 changes: 165 additions & 0 deletions crates/but-inspection/src/db.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
//! A stopgap, JSON-file-backed store for hunk vectors and processed commits.
//!
//! This is due to be replaced with a real vector database; getting one of
//! those to work proved too time-consuming for now.
use std::{
collections::HashMap,
hash::{DefaultHasher, Hash, Hasher as _},
path::PathBuf,
};

use anyhow::{Result, bail};
use bstr::BString;
use gitbutler_command_context::CommandContext;
use serde::{Deserialize, Serialize};
use tracing::instrument;

/// A single hunk of a commit together with the vector it is searched by.
///
/// Entries are identified by the pair of commit id and hunk header, see
/// [`Hunk::key`].
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Hunk {
    /// The sha of the commit
    #[serde(with = "gitbutler_serde::object_id")]
    pub oid: gix::ObjectId,
    /// The hunk header; combined with `oid` it forms the identity of the entry.
    pub header: String,
    /// Path of the file the hunk belongs to.
    // NOTE(review): presumably the path *after* the change - confirm against the producer.
    pub path: BString,
    /// An earlier path of the file, if any.
    // NOTE(review): presumably only set for renames - confirm against the producer.
    pub previous_path: Option<BString>,
    /// The vector that [`DbHandle::search_hunks`] compares against the search term.
    pub vector: Vec<f32>,
}

impl Hunk {
    /// Returns the identity of this entry, derived from the commit id and the
    /// hunk header.
    ///
    /// We should only ever have one entry per commit & hunk header, so two
    /// hunks with the same key are considered the same entry.
    pub fn key(&self) -> u64 {
        let mut state = DefaultHasher::new();
        // Feed both identifying fields into the same hasher, commit id first.
        self.oid.hash(&mut state);
        self.header.hash(&mut state);
        state.finish()
    }
}

/// Used to denote a commit has been processed
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Commit {
    /// The sha of the commit
    #[serde(with = "gitbutler_serde::object_id")]
    pub oid: gix::ObjectId,
}

/// The complete contents of the store, (de)serialized as a whole.
///
/// `Db::default()` is the empty store: no hunks and no commits.
#[derive(Serialize, Deserialize, Default)]
pub struct Db {
    /// All stored hunks, in no particular order.
    pub hunks: Vec<Hunk>,
    /// All commits that have been recorded as processed, in no particular order.
    pub commits: Vec<Commit>,
}

/// Handle to the on-disk store.
///
/// The handle holds no data itself; it only remembers where the backing file
/// lives, so it is cheap to clone.
#[derive(Debug, Clone)]
pub struct DbHandle {
    /// Location of the file the store is read from and written to.
    path: PathBuf,
}

// TODO: Replace with real vector database
impl DbHandle {
    /// Creates a handle for the `inspection.json` file inside the project's
    /// GitButler directory. Nothing is read or created on disk yet.
    pub fn new(ctx: &CommandContext) -> Self {
        Self {
            path: ctx.project().gb_dir().join("inspection.json"),
        }
    }

    /// Loads the whole store from disk.
    ///
    /// A missing file is not an error: it yields an empty store.
    ///
    /// # Errors
    /// Fails if the file cannot be read for any reason other than not
    /// existing, or if its content is not valid JSON for [`Db`].
    #[instrument(skip_all)]
    pub fn read(&self) -> Result<Db> {
        // Read directly and inspect the error instead of checking for existence
        // first: one syscall less, and no window in which the file can vanish
        // between the check and the read.
        match std::fs::read_to_string(&self.path) {
            Ok(content) => Ok(serde_json::from_str(&content)?),
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(Db {
                hunks: Vec::new(),
                commits: Vec::new(),
            }),
            Err(err) => Err(err.into()),
        }
    }

    /// Serializes `db` and replaces the file on disk with it, creating the
    /// parent directory if necessary.
    #[instrument(skip_all)]
    fn write(&self, db: &Db) -> Result<()> {
        let content = serde_json::to_string(db)?;
        // A path without a parent has no directory to create.
        if let Some(parent) = self.path.parent() {
            std::fs::create_dir_all(parent)?;
        }
        std::fs::write(&self.path, content)?;
        Ok(())
    }

    /// Merges `incoming` into `existing`, keeping one entry per commit id.
    /// On a clash the incoming entry wins. The result is in no particular order.
    fn merge_commits(
        existing: Vec<Commit>,
        incoming: impl IntoIterator<Item = Commit>,
    ) -> Vec<Commit> {
        let mut by_oid: HashMap<gix::ObjectId, Commit> =
            existing.into_iter().map(|c| (c.oid, c)).collect();
        by_oid.extend(incoming.into_iter().map(|c| (c.oid, c)));
        by_oid.into_values().collect()
    }

    /// Inserts `entries`, replacing stored hunks that have the same
    /// [`Hunk::key`], and records the commit of every entry as processed.
    ///
    /// Returns all hunks that are stored afterwards, in no particular order.
    ///
    /// # Errors
    /// Fails if the store cannot be read or written.
    #[instrument(skip_all)]
    pub fn upsert_many_hunks(&self, entries: &[Hunk]) -> Result<Vec<Hunk>> {
        let mut db = self.read()?;

        let mut by_key: HashMap<u64, Hunk> =
            db.hunks.into_iter().map(|h| (h.key(), h)).collect();
        by_key.extend(entries.iter().map(|h| (h.key(), h.clone())));
        db.hunks = by_key.into_values().collect();

        // Update the commits on the very same in-memory snapshot. Going through
        // `upsert_many_commits` here would persist the commits separately and
        // the write below would then overwrite them with this snapshot's stale
        // commit list.
        db.commits =
            Self::merge_commits(db.commits, entries.iter().map(|h| Commit { oid: h.oid }));

        self.write(&db)?;

        Ok(db.hunks)
    }

    /// Records `entries` as processed commits, keeping one entry per commit id.
    ///
    /// Returns all commits that are stored afterwards, in no particular order.
    ///
    /// # Errors
    /// Fails if the store cannot be read or written.
    #[instrument(skip_all)]
    pub fn upsert_many_commits(&self, entries: &[Commit]) -> Result<Vec<Commit>> {
        let mut db = self.read()?;

        db.commits = Self::merge_commits(db.commits, entries.iter().cloned());

        self.write(&db)?;

        Ok(db.commits)
    }

    // TODO: Replace with real vector db search rather than a manual implementation.
    /// Scores every stored hunk against `term` and returns the hunks with their
    /// score, best match first. With `cutoff` set, at most that many results
    /// are returned.
    ///
    /// # Errors
    /// Fails if the store cannot be read, or if a stored vector does not have
    /// the same length as `term`.
    #[instrument(skip_all)]
    pub fn search_hunks(&self, term: Vec<f32>, cutoff: Option<usize>) -> Result<Vec<(Hunk, f32)>> {
        let db = self.read()?;

        let mut scored = db
            .hunks
            .into_iter()
            .map(|hunk| {
                let score = cosine_distance(&hunk.vector, &term)?;
                Ok((hunk, score))
            })
            .collect::<Result<Vec<(Hunk, f32)>>>()?;

        // Sort descending. A NaN score (e.g. from an all-zero vector) must
        // neither panic the comparison nor outrank real matches, so it is
        // ranked below everything else.
        let rank = |score: f32| if score.is_nan() { f32::NEG_INFINITY } else { score };
        scored.sort_by(|a, b| rank(b.1).total_cmp(&rank(a.1)));

        if let Some(cutoff) = cutoff {
            scored.truncate(cutoff);
        }

        Ok(scored)
    }
}

/// Computes the cosine of the angle between `a` and `b`:
/// `dot(a, b) / (|a| * |b|)`.
///
/// Despite the name this is a *similarity*: larger values mean the vectors
/// point in more similar directions.
///
/// If either vector has zero magnitude (which includes empty vectors) the
/// angle is undefined; `0.0` is returned instead of the NaN a plain division
/// would produce.
///
/// # Errors
/// Fails if the vectors differ in length.
fn cosine_distance(a: &[f32], b: &[f32]) -> Result<f32> {
    if a.len() != b.len() {
        bail!("Vectors MUST be the same length!")
    }

    // One pass over both vectors: dot product and both squared magnitudes.
    let (dot, a_sq, b_sq) = a.iter().zip(b).fold(
        (0.0_f32, 0.0_f32, 0.0_f32),
        |(dot, a_sq, b_sq), (x, y)| (dot + x * y, a_sq + x * x, b_sq + y * y),
    );

    let magnitude = a_sq.sqrt() * b_sq.sqrt();
    if magnitude == 0.0 {
        return Ok(0.0);
    }

    Ok(dot / magnitude)
}
Loading
Loading