name: Compile Dataset

on:
  push:
    branches: [ main ]
    paths:
      - '*/questionCode.ts'
      - '*/answer.ts'
      - '*/Note.md'
  schedule:
    # Run every Monday at 12:00 UTC
    - cron: '0 12 * * 1'
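  # Allow manual runs from the Actions tab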
  workflow_dispatch:

permissions:
  contents: write

jobs:
  compile:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history for all branches
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '18'
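
      # npm dependencies for the dataset compiler live under scripts/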
      - name: Install dependencies
        run: |
          cd scripts
          npm install

      - name: Compile dataset
        run: node scripts/compile-dataset.js
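
      # jq is preinstalled on GitHub-hosted Ubuntu runners, so no extra setup is needed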
      - name: Show dataset stats
        run: |
          echo "Dataset compilation completed!"
          echo "Dataset size: $(wc -l < data/datasets.jsonl) problems"
          echo "File size: $(du -h data/datasets.jsonl | cut -f1)"
          echo ""
          echo "Sample entries:"
          head -2 data/datasets.jsonl | jq '.'
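
      # Fail the job if any line is not valid JSON or is missing a required field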
      - name: Validate JSONL format
        run: |
          echo "Validating JSONL format..."
          node -e "
          const fs = require('fs');
          const readline = require('readline');

          async function validateJSONL() {
            const fileStream = fs.createReadStream('data/datasets.jsonl');
            const rl = readline.createInterface({
              input: fileStream,
              crlfDelay: Infinity
            });

            let lineNumber = 0;
            let validEntries = 0;
            let errors = 0;

            for await (const line of rl) {
              lineNumber++;
              try {
                const data = JSON.parse(line);
                if (!data.text || !data.question || !data.constraints || !data.thought || !data.answer || !data.src || !data.time_complexity || !data.space_complexity) {
                  console.error(\`Line \${lineNumber}: Missing required fields\`);
                  errors++;
                } else {
                  validEntries++;
                }
              } catch (e) {
                console.error(\`Line \${lineNumber}: Invalid JSON - \${e.message}\`);
                errors++;
              }
            }

            console.log(\`Validation complete: \${validEntries} valid entries, \${errors} errors\`);
            if (errors > 0) {
              process.exit(1);
            }
          }

          validateJSONL();
          "
      - name: Create dataset branch
        run: |
          # Create or switch to the dataset branch
          git checkout -B dataset

          # Back up the compiled dataset and README before removing files
          cp data/datasets.jsonl datasets-backup.jsonl
          cp scripts/README.md.dataset readme-backup.md

          # Remove all tracked files except the ones we want to keep
          git rm -rf . || true

          # Restore essential files from main
          git checkout main -- .gitattributes .gitignore || true

          # Recreate the data directory and restore the dataset and README
          mkdir -p data
          mv datasets-backup.jsonl data/datasets.jsonl
          mv readme-backup.md README.md

          # Update .gitignore for the dataset branch
          echo "# Dataset branch - only contains compiled data" > .gitignore
          echo "node_modules/" >> .gitignore
          echo "*.log" >> .gitignore
          echo ".DS_Store" >> .gitignore

          # Create .gitattributes so large JSONL files go through Git LFS
          echo "*.jsonl filter=lfs diff=lfs merge=lfs -text" > .gitattributes
          echo "data/*.jsonl filter=lfs diff=lfs merge=lfs -text" >> .gitattributes

          # Add and commit files
          git add .
          git config --global user.name 'github-actions[bot]'
          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
          git commit -m "Update dataset - $(date -u +%Y-%m-%d)" || echo "No changes to commit"

          # Push to the dataset branch with token authentication
          git push https://${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git dataset --force
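
      # The Hugging Face upload steps below run only on scheduled and manual (workflow_dispatch) triggers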
      - name: Setup Python for Hugging Face
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'

      - name: Install Hugging Face Hub
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        run: pip install --upgrade huggingface_hub

      # Make sure the local checkout is on the dataset branch (nothing is pushed here)
      - name: Switch to dataset branch for upload
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        run: git checkout dataset
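
      # HF_TOKEN is expected to be a repository secret holding a Hugging Face token with write access to the target dataset repo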
      - name: Authenticate to Hugging Face
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          pip install --upgrade "huggingface_hub>=0.24.6"
          hf auth login --token "$HF_TOKEN"
          hf auth whoami

      # Ensure the dataset repo exists on the Hub
      - name: Ensure dataset repo exists
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python - <<'PY'
          import os
          from huggingface_hub import create_repo
          create_repo("twinkle-ai/tw-leetcode", repo_type="dataset",
                      exist_ok=True, token=os.environ["HF_TOKEN"])
          PY

      # Upload via a PR to the existing 'main' branch
      - name: Upload to Hugging Face (single PR)
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          set -e
          pip install --upgrade "huggingface_hub>=0.24.6"

          # Log in non-interactively
          hf auth login --token "$HF_TOKEN"

          # We are on the local 'dataset' branch, which only contains what we need:
          # data/, README.md, .gitattributes
          # Push everything at the repo root in one shot to avoid opening multiple PRs.
          BASE_BRANCH=main
          COMMIT_MSG="Auto-update dataset - $(date -u +%Y-%m-%d)"

          # Single upload => single PR
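          # The two '.' arguments are the local path and the path in the repo: upload the current directory to the repo root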
          hf upload twinkle-ai/tw-leetcode . . \
            --repo-type dataset \
            --revision "$BASE_BRANCH" \
            --create-pr \
            --commit-message "$COMMIT_MSG"