firebolt-db · crcarreras · Jul 8, 2025 · fb-cole · Jul 10, 2025
diff --git a/benchmarks/FireScale/databricks/benchmark.sql b/benchmarks/FireScale/databricks/benchmark.sql
diff --git a/benchmarks/FireScale/databricks/queries.json b/benchmarks/FireScale/databricks/queries.json
diff --git a/benchmarks/FireScale/databricks/setup.sql b/benchmarks/FireScale/databricks/setup.sql
@@ -0,0 +1,96 @@
+-- Reset step: drop the aggregating index and the five benchmark tables when
+-- they already exist, so the CREATE TABLE statements below start from scratch.
+-- NOTE(review): DROP AGGREGATING INDEX (like PRIMARY INDEX and pgdate further
+-- down) looks like Firebolt SQL although this file lives under databricks/ --
+-- confirm the intended target engine.
+DROP AGGREGATING INDEX IF EXISTS idx_by_day;
+DROP TABLE IF EXISTS uservisits;
+DROP TABLE IF EXISTS rankings;
+DROP TABLE IF EXISTS ipaddresses;
+DROP TABLE IF EXISTS agents;
+DROP TABLE IF EXISTS searchwords;   
+
+-- uservisits: all columns NOT NULL; primary index on
+-- (visitdate, destinationurl, sourceip).
+CREATE TABLE "uservisits" (
+    "sourceip" text NOT NULL,
+    "destinationurl" text NOT NULL,
+    "visitdate" pgdate NOT NULL,
+    "adrevenue" REAL NOT NULL,
+    "useragent" text NOT NULL,
+    "countrycode" text NOT NULL,
+    "languagecode" text NOT NULL,
+    "searchword" text NOT NULL,
+    "duration" integer NOT NULL
+)
+PRIMARY INDEX "visitdate", "destinationurl", "sourceip";
+
+-- ipaddresses: primary index on ip.
+CREATE TABLE "ipaddresses" (
+    "ip" text NOT NULL,
+    "autonomoussystem" integer NOT NULL,
+    "asname" text NOT NULL
+)
+PRIMARY INDEX "ip";
+
+-- rankings: primary index on pageurl; pagerank is the only nullable column
+-- declared in this script.
+CREATE TABLE "rankings" (
+    "pageurl" text NOT NULL,
+    "pagerank" integer NULL,
+    "avgduration" integer NOT NULL
+)
+PRIMARY INDEX "pageurl";
+
+-- agents: declared without a primary index.
+CREATE TABLE "agents" (
+    "id" integer NOT NULL,
+    "agentname" text NOT NULL,
+    "operatingsystem" text NOT NULL,
+    "devicearch" text NOT NULL,
+    "browser" text NOT NULL
+);
+
+-- searchwords: declared without a primary index.
+CREATE TABLE "searchwords" (
+    "word" text NOT NULL,
+    "word_hash" bigint NOT NULL,
+    "word_id" bigint NOT NULL,
+    "firstseen" pgdate NOT NULL,
+    "is_topic" boolean NOT NULL
+);
+
+-- Bulk-load every table from Parquet files under the
+-- firebolt-benchmarks-requester-pays-us-east-1 bucket (firenewt/1tb prefix).
+-- All five loads authenticate with the same IAM role.
+
+-- uservisits is read from the gz-parquet/ sub-folder.
+COPY INTO uservisits
+FROM 's3://firebolt-benchmarks-requester-pays-us-east-1/firenewt/1tb/uservisits/gz-parquet/'
+WITH
+    CREDENTIALS = (AWS_ROLE_ARN = 'arn:aws:iam::442042532160:role/FireboltS3DatasetsAccess')
+    TYPE = parquet;
+
+COPY INTO rankings
+FROM 's3://firebolt-benchmarks-requester-pays-us-east-1/firenewt/1tb/rankings/'
+WITH
+    CREDENTIALS = (AWS_ROLE_ARN = 'arn:aws:iam::442042532160:role/FireboltS3DatasetsAccess')
+    TYPE = parquet;
+
+-- The three dimension tables live under the dimensions/ prefix.
+COPY INTO ipaddresses
+FROM 's3://firebolt-benchmarks-requester-pays-us-east-1/firenewt/1tb/dimensions/ipaddresses/'
+WITH
+    CREDENTIALS = (AWS_ROLE_ARN = 'arn:aws:iam::442042532160:role/FireboltS3DatasetsAccess')
+    TYPE = parquet;
+
+COPY INTO agents
+FROM 's3://firebolt-benchmarks-requester-pays-us-east-1/firenewt/1tb/dimensions/agents/'
+WITH
+    CREDENTIALS = (AWS_ROLE_ARN = 'arn:aws:iam::442042532160:role/FireboltS3DatasetsAccess')
+    TYPE = parquet;
+
+COPY INTO searchwords
+FROM 's3://firebolt-benchmarks-requester-pays-us-east-1/firenewt/1tb/dimensions/searchwords/'
+WITH
+    CREDENTIALS = (AWS_ROLE_ARN = 'arn:aws:iam::442042532160:role/FireboltS3DatasetsAccess')
+    TYPE = parquet;
+
+-- Post-load maintenance: run VACUUM on each of the five tables loaded above.
+-- NOTE(review): uservisits is vacuumed twice in a row -- confirm whether the
+-- second pass is intentional (e.g. further compaction of the largest table)
+-- or a copy/paste duplicate that can be removed.
+VACUUM uservisits;
+
+VACUUM uservisits;
+
+VACUUM rankings;
+
+VACUUM searchwords;
+
+VACUUM agents;
+
+VACUUM ipaddresses;
diff --git a/clients/python/src/connectors/__init__.py b/clients/python/src/connectors/__init__.py
@@ -2,12 +2,14 @@
 from .firebolt import FireboltConnector
 from .redshift import RedshiftConnector
 from .snowflake import SnowflakeConnector
+from .databricks import DatabricksConnector
 
 __all__ = [
     "FireboltConnector",
     "SnowflakeConnector",
     "BigQueryConnector",
     "RedshiftConnector",
+    "DatabricksConnector"
 ]
 
 
@@ -18,6 +20,7 @@ def get_connector_class(vendor: str):
         "firebolt": FireboltConnector,
         "bigquery": BigQueryConnector,
         "redshift": RedshiftConnector,
+        "databricks": DatabricksConnector
     }
     if vendor not in connector_map:
         raise ValueError(f"Unsupported vendor: {vendor}")

diff --git a/clients/python/src/connectors/databricks.py b/clients/python/src/connectors/databricks.py
@@ -0,0 +1,67 @@
+from databricks import sql
+from typing import Any, Dict, Optional, List
+
+class DatabricksConnector:
+    """Thin wrapper around a Databricks SQL warehouse connection.
+
+    The connection is opened lazily, either explicitly through ``connect()``
+    or implicitly by the first ``execute_query()`` call, and is released
+    with ``close()``.
+    """
+
+    def __init__(self, config: Dict[str, str]):
+        """
+        Initialize Databricks connector with configuration parameters.
+
+        Args:
+            config (Dict[str, str]): Configuration dictionary containing:
+            "server_hostname": "your sql warehouse hostname",
+            "http_path": "http path for warehouse",
+            "access_token": "your databricks personal access token",
+            "catalog": "Databricks catalog name",
+            "schema": "Databricks schema name"
+
+        Raises:
+            ValueError: If any required configuration key is missing.
+        """
+        self.config = config
+        # Connection state is created on demand by connect().
+        self._conn = None
+        self.cursor = None
+        self._validate_config()
+
+    def _validate_config(self) -> None:
+        """Validate that required configuration parameters are present.
+
+        Only the presence of each key is checked, not its value.
+
+        Raises:
+            ValueError: Listing every required key absent from ``self.config``.
+        """
+        required_params = ['server_hostname', 'http_path', 'access_token', 'catalog', 'schema']
+        missing_params = [param for param in required_params if param not in self.config]
+        if missing_params:
+            raise ValueError(f"Missing required configuration parameters: {missing_params}")
+
+    def connect(self) -> None:
+        """Connect to Databricks using stored configuration.
+
+        Does nothing when a connection is already open. The connection and
+        cursor are only stored on the instance once the session has been
+        fully set up, so a failed setup never leaves a half-initialised
+        session behind.
+        """
+        if self._conn:
+            return
+
+        conn = sql.connect(
+            server_hostname=self.config['server_hostname'],
+            http_path=self.config['http_path'],
+            access_token=self.config['access_token'],
+            catalog=self.config['catalog'],
+            schema=self.config['schema'],
+        )
+        try:
+            cursor = conn.cursor()
+            # Turn off result caching for this session so that repeated
+            # benchmark queries are actually executed every time.
+            cursor.execute("SET use_cached_result = false;")
+        except Exception:
+            # Without this, a later execute_query() would silently reuse a
+            # connection on which caching was never disabled.
+            conn.close()
+            raise
+
+        self._conn = conn
+        self.cursor = cursor
+
+    def execute_query(self, query: str, params: Optional[Dict[str, Any]] = None) -> List[Dict]:
+        """
+        Execute a SQL query and return all result rows.
+
+        Opens the connection first when none is open yet.
+
+        Args:
+            query (str): SQL query to execute
+            params (Optional[Dict[str, Any]]): Query parameters for parameterized queries
+
+        Returns:
+            List[Dict]: Whatever ``cursor.fetchall()`` returns for the query.
+            NOTE(review): the rows are passed through unconverted; confirm
+            callers do not rely on them being plain ``dict`` objects.
+
+        Raises:
+            Exception: Wrapping any error raised while executing or fetching;
+                the original error is available as ``__cause__``.
+        """
+        if not self._conn or not self.cursor:
+            self.connect()
+
+        try:
+            self.cursor.execute(query, params or {})
+            return self.cursor.fetchall()
+        except Exception as e:
+            # Chain the original error so its type and traceback are kept.
+            raise Exception(f"Error executing query: {e}") from e
+
+    def close(self) -> None:
+        """Close the Databricks connection if it exists.
+
+        The stored connection and cursor are cleared even when closing
+        raises, so a later call reconnects instead of reusing a dead handle.
+        """
+        if self._conn:
+            try:
+                self._conn.close()
+            finally:
+                self._conn = None
+                self.cursor = None
diff --git a/config/credentials/sample_credentials.json b/config/credentials/sample_credentials.json
@@ -27,5 +27,12 @@
         "project_id": "your_project_id",
         "dataset": "your_dataset",
         "key": "your json key generated from google cloud"
+    },
+    "databricks": {
+        "server_hostname": "your Databricks SQL warehouse server hostname",
+        "http_path": "your Databricks SQL warehouse http path",
+        "access_token": "your Databricks personal access token",
+        "catalog": "your Databricks catalog",
+        "schema": "your Databricks schema"
     }
 }