Compile Dataset #107
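# Workflow overview: compile the per-problem questionCode.ts / answer.ts / Note.md
# files into data/datasets.jsonl, validate it, publish it to a `dataset` branch,
# and (on schedule or manual dispatch) open a PR against the Hugging Face dataset repo.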
name: Compile Dataset

on:
  push:
    branches: [ main ]
    paths:
      - '*/questionCode.ts'
      - '*/answer.ts'
      - '*/Note.md'
  schedule:
    # Run every Monday at 12:00 UTC
    - cron: '0 12 * * 1'
  workflow_dispatch:

permissions:
  contents: write
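  # 'contents: write' lets the job force-push the compiled dataset branch back to this repo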
jobs:
  compile:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history for all branches
          token: ${{ secrets.GITHUB_TOKEN }}
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '18'
      - name: Install dependencies
        run: |
          cd scripts
          npm install
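      # compile-dataset.js is assumed to walk the per-problem folders (questionCode.ts,
      # answer.ts, Note.md) and merge them into data/datasets.jsonl; see scripts/ for details.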
      - name: Compile dataset
        run: node scripts/compile-dataset.js
      - name: Show dataset stats
        run: |
          echo "Dataset compilation completed!"
          echo "Dataset size: $(wc -l < data/datasets.jsonl) problems"
          echo "File size: $(du -h data/datasets.jsonl | cut -f1)"
          echo ""
          echo "Sample entries:"
          head -2 data/datasets.jsonl | jq '.'
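      # Each line of data/datasets.jsonl is expected to carry the fields checked below
      # (illustrative values only):
      #   {"text": "...", "question": "...", "constraints": "...", "thought": "...",
      #    "answer": "...", "src": "...", "time_complexity": "O(n)", "space_complexity": "O(1)"}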
      - name: Validate JSONL format
        run: |
          echo "Validating JSONL format..."
          node -e "
          const fs = require('fs');
          const readline = require('readline');
          async function validateJSONL() {
            const fileStream = fs.createReadStream('data/datasets.jsonl');
            const rl = readline.createInterface({
              input: fileStream,
              crlfDelay: Infinity
            });
            let lineNumber = 0;
            let validEntries = 0;
            let errors = 0;
            for await (const line of rl) {
              lineNumber++;
              try {
                const data = JSON.parse(line);
                if (!data.text || !data.question || !data.constraints || !data.thought || !data.answer || !data.src || !data.time_complexity || !data.space_complexity) {
                  console.error(\`Line \${lineNumber}: Missing required fields\`);
                  errors++;
                } else {
                  validEntries++;
                }
              } catch (e) {
                console.error(\`Line \${lineNumber}: Invalid JSON - \${e.message}\`);
                errors++;
              }
            }
            console.log(\`Validation complete: \${validEntries} valid entries, \${errors} errors\`);
            if (errors > 0) {
              process.exit(1);
            }
          }
          validateJSONL();
          "
      - name: Create dataset branch
        run: |
          # Create or switch to dataset branch
          git checkout -B dataset
          # Backup the compiled dataset and README before removing files
          cp data/datasets.jsonl datasets-backup.jsonl
          cp scripts/README.md.dataset readme-backup.md
          # Remove all files except the ones we want to keep
          git rm -rf . || true
          # Restore essential files from main
          git checkout main -- .gitattributes .gitignore || true
          # Create data directory and restore the dataset and README
          mkdir -p data
          mv datasets-backup.jsonl data/datasets.jsonl
          mv readme-backup.md README.md
          # Update .gitignore for dataset branch
          echo "# Dataset branch - only contains compiled data" > .gitignore
          echo "node_modules/" >> .gitignore
          echo "*.log" >> .gitignore
          echo ".DS_Store" >> .gitignore
          # Create .gitattributes for large files
          echo "*.jsonl filter=lfs diff=lfs merge=lfs -text" > .gitattributes
          echo "data/*.jsonl filter=lfs diff=lfs merge=lfs -text" >> .gitattributes
          # Add and commit files
          git add .
          git config --global user.name 'github-actions[bot]'
          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
          git commit -m "Update dataset - $(date -u +%Y-%m-%d)" || echo "No changes to commit"
          # Push to dataset branch with proper authentication
          git push https://${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git dataset --force
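      # The remaining steps run only on scheduled or manually dispatched runs and
      # push the compiled dataset to the Hugging Face Hub.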
      - name: Setup Python for Hugging Face
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'
      - name: Install Hugging Face Hub
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        run: pip install --upgrade huggingface_hub
      # make sure we're on the local dataset branch before uploading
      - name: Switch to dataset branch for upload
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        run: git checkout dataset
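      # HF_TOKEN is a repository secret holding a Hugging Face access token with write
      # access to the target dataset repo.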
      - name: Authenticate to Hugging Face
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          pip install --upgrade "huggingface_hub>=0.24.6"
          hf auth login --token "$HF_TOKEN"
          hf auth whoami
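      # Note: the `hf` CLI entry point ships with recent huggingface_hub releases;
      # `pip install --upgrade` above ensures a version new enough to provide it.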
      # ensure the dataset repo exists
      - name: Ensure dataset repo exists
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python - <<'PY'
          import os
          from huggingface_hub import create_repo
          create_repo("twinkle-ai/tw-leetcode", repo_type="dataset",
                      exist_ok=True, token=os.environ["HF_TOKEN"])
          PY
      # Upload via PR to the existing 'main' branch
      - name: Upload to Hugging Face (single PR)
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          set -e
          pip install --upgrade "huggingface_hub>=0.24.6"
          # login (non-interactive)
          hf auth login --token "$HF_TOKEN"
          # We are on the 'dataset' branch locally which only has what we need:
          # data/, README.md, .gitattributes
          # Push EVERYTHING at repo root in one shot to avoid multiple PRs.
          BASE_BRANCH=main
          COMMIT_MSG="Auto-update dataset - $(date -u +%Y-%m-%d)"
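          # hf upload <repo_id> <local_path> <path_in_repo>: both paths below are "." so
          # the entire branch contents (data/, README.md, .gitattributes) land at the repo
          # root, and --create-pr opens one pull request against main instead of pushing
          # to it directly.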
          # Single upload => single PR
          hf upload twinkle-ai/tw-leetcode . . \
            --repo-type dataset \
            --revision "$BASE_BRANCH" \
            --create-pr \
            --commit-message "$COMMIT_MSG"