Compile Dataset #107
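# Workflow overview: compile the per-problem questionCode.ts / answer.ts / Note.md
# files into data/datasets.jsonl, validate it, publish it to a `dataset` branch,
# and (on schedule or manual dispatch) open a PR against the Hugging Face dataset repo.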
name: Compile Dataset

on:
  push:
    branches: [ main ]
    paths:
      - '*/questionCode.ts'
      - '*/answer.ts'
      - '*/Note.md'
  schedule:
    # Run every Monday at 12:00 UTC
    - cron: '0 12 * * 1'
  workflow_dispatch:

permissions:
  contents: write
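  # 'contents: write' lets the job force-push the compiled dataset branch back to this repo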
jobs:
  compile:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Fetch all history for all branches
          token: ${{ secrets.GITHUB_TOKEN }}
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '18'
      - name: Install dependencies
        run: |
          cd scripts
          npm install
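      # compile-dataset.js is assumed to walk the per-problem folders (questionCode.ts,
      # answer.ts, Note.md) and merge them into data/datasets.jsonl; see scripts/ for details.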
      - name: Compile dataset
        run: node scripts/compile-dataset.js
      - name: Show dataset stats
        run: |
          echo "Dataset compilation completed!"
          echo "Dataset size: $(wc -l < data/datasets.jsonl) problems"
          echo "File size: $(du -h data/datasets.jsonl | cut -f1)"
          echo ""
          echo "Sample entries:"
          head -2 data/datasets.jsonl | jq '.'
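      # Each line of data/datasets.jsonl is expected to carry the fields checked below
      # (illustrative values only):
      #   {"text": "...", "question": "...", "constraints": "...", "thought": "...",
      #    "answer": "...", "src": "...", "time_complexity": "O(n)", "space_complexity": "O(1)"}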
      - name: Validate JSONL format
        run: |
          echo "Validating JSONL format..."
          node -e "
          const fs = require('fs');
          const readline = require('readline');
          async function validateJSONL() {
            const fileStream = fs.createReadStream('data/datasets.jsonl');
            const rl = readline.createInterface({
              input: fileStream,
              crlfDelay: Infinity
            });
            let lineNumber = 0;
            let validEntries = 0;
            let errors = 0;
            for await (const line of rl) {
              lineNumber++;
              try {
                const data = JSON.parse(line);
                if (!data.text || !data.question || !data.constraints || !data.thought || !data.answer || !data.src || !data.time_complexity || !data.space_complexity) {
                  console.error(\`Line \${lineNumber}: Missing required fields\`);
                  errors++;
                } else {
                  validEntries++;
                }
              } catch (e) {
                console.error(\`Line \${lineNumber}: Invalid JSON - \${e.message}\`);
                errors++;
              }
            }
            console.log(\`Validation complete: \${validEntries} valid entries, \${errors} errors\`);
            if (errors > 0) {
              process.exit(1);
            }
          }
          validateJSONL();
          "
      - name: Create dataset branch
        run: |
          # Create or switch to dataset branch
          git checkout -B dataset
          # Backup the compiled dataset and README before removing files
          cp data/datasets.jsonl datasets-backup.jsonl
          cp scripts/README.md.dataset readme-backup.md
          # Remove all files except the ones we want to keep
          git rm -rf . || true
          # Restore essential files from main
          git checkout main -- .gitattributes .gitignore || true
          # Create data directory and restore the dataset and README
          mkdir -p data
          mv datasets-backup.jsonl data/datasets.jsonl
          mv readme-backup.md README.md
          # Update .gitignore for dataset branch
          echo "# Dataset branch - only contains compiled data" > .gitignore
          echo "node_modules/" >> .gitignore
          echo "*.log" >> .gitignore
          echo ".DS_Store" >> .gitignore
          # Create .gitattributes for large files
          echo "*.jsonl filter=lfs diff=lfs merge=lfs -text" > .gitattributes
          echo "data/*.jsonl filter=lfs diff=lfs merge=lfs -text" >> .gitattributes
          # Add and commit files
          git add .
          git config --global user.name 'github-actions[bot]'
          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
          git commit -m "Update dataset - $(date -u +%Y-%m-%d)" || echo "No changes to commit"
          # Push to dataset branch with proper authentication
          git push https://${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git dataset --force
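      # The remaining steps run only on scheduled or manually dispatched runs and
      # push the compiled dataset to the Hugging Face Hub.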
      - name: Setup Python for Hugging Face
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'
      - name: Install Hugging Face Hub
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        run: pip install --upgrade huggingface_hub
      # make sure we're on the local dataset branch before uploading
      - name: Switch to dataset branch for upload
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        run: git checkout dataset
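      # HF_TOKEN is a repository secret holding a Hugging Face access token with write
      # access to the target dataset repo.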
      - name: Authenticate to Hugging Face
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          pip install --upgrade "huggingface_hub>=0.24.6"
          hf auth login --token "$HF_TOKEN"
          hf auth whoami
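      # Note: the `hf` CLI entry point ships with recent huggingface_hub releases;
      # `pip install --upgrade` above ensures a version new enough to provide it.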
      # ensure the dataset repo exists
      - name: Ensure dataset repo exists
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python - <<'PY'
          import os
          from huggingface_hub import create_repo
          create_repo("twinkle-ai/tw-leetcode", repo_type="dataset",
                      exist_ok=True, token=os.environ["HF_TOKEN"])
          PY
      # Upload via PR to the existing 'main' branch
      - name: Upload to Hugging Face (single PR)
        if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          set -e
          pip install --upgrade "huggingface_hub>=0.24.6"
          # login (non-interactive)
          hf auth login --token "$HF_TOKEN"
          # We are on the 'dataset' branch locally which only has what we need:
          # data/, README.md, .gitattributes
          # Push EVERYTHING at repo root in one shot to avoid multiple PRs.
          BASE_BRANCH=main
          COMMIT_MSG="Auto-update dataset - $(date -u +%Y-%m-%d)"
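          # hf upload <repo_id> <local_path> <path_in_repo>: both paths below are "." so
          # the entire branch contents (data/, README.md, .gitattributes) land at the repo
          # root, and --create-pr opens one pull request against main instead of pushing
          # to it directly.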
          # Single upload => single PR
          hf upload twinkle-ai/tw-leetcode . . \
            --repo-type dataset \
            --revision "$BASE_BRANCH" \
            --create-pr \
            --commit-message "$COMMIT_MSG"