Add a script to automatically add publications to website (#4)

EmilyBourne · EmilyBourne · commit c0512dbcec67 · 2025-09-18T17:02:29.000+02:00
Add a script which runs via the CI once a month. The script uses OpenAlex to search for all relevant articles. It does this by searching for the keywords "gysela", "gyselax" and "gyselalib" amongst articles published since a given date. The date is 2019-01-01 by default (which corresponds to the most recent article currently indexed), but it is usually equal to the date on which the workflow last ran successfully minus 2 months. It filters the articles found to: - Remove preprints - Keep articles with the keyword in the title or abstract - Keep articles where one of the authors is one of the people in https://github.com/gyselax/gyselax.github.io/tree/main/content/authors For each of the remaining articles, a `cite.bib` file and an `index.md` file are created in an appropriately named sub-folder in https://github.com/gyselax/gyselax.github.io/tree/main/content/publication. If any publications are added, a branch is created with these changes and a PR is opened. If PRs are merged regularly, the same article should not appear in more than one PR. Once the PR has been created, it can be pruned manually before merging if articles were inappropriately added.
diff --git a/.github/workflows/check-for-new-publications.yml b/.github/workflows/check-for-new-publications.yml
@@ -0,0 +1,75 @@
+name: Update Publications
+
+# Runs scripts/update_publications.py and, when it creates new files under
+# content/publication, pushes them on a dated branch and opens a pull request.
+on:
+  schedule:
+    # Runs at 03:00 on the first day of each month
+    - cron: '0 3 1 * *'
+  workflow_dispatch:  # allows manual trigger
+
+# The job lists previous workflow runs, pushes a branch and opens a pull
+# request, so the workflow token needs these scopes explicitly.
+permissions:
+  actions: read
+  contents: write
+  pull-requests: write
+
+jobs:
+  update-publications:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install requests pyyaml unidecode
+      - name: Get last run date
+        run: |
+          last_run_iso=$(gh run list --workflow "Update Publications" --status success --limit 1 --json createdAt --jq '.[0].createdAt'  2>/dev/null || echo "")
+          # The jq filter may print a literal "null" when there is no previous run
+          if [ -z "$last_run_iso" ] || [ "$last_run_iso" = "null" ]; then
+            echo "No last run found"
+            last_run_iso="2019-01-01T00:00:00Z"   # fallback default
+          fi
+          # Remove 2 months to allow lots of time for indexing
+          CHECK_FROM=$(date -u -d "$last_run_iso -2 months" +"%Y-%m-%d")
+          echo "CHECK_FROM=$CHECK_FROM"
+          echo "CHECK_FROM=$CHECK_FROM" >> $GITHUB_ENV
+        env:
+          # gh cannot authenticate inside a workflow without a token; without
+          # it the lookup fails and the fallback date is always used.
+          GH_TOKEN: ${{ github.token }}
+      - name: Create branch
+        run: |
+          branch_name="update-publications-$(date +'%Y%m%d')"
+          echo "branch_name=${branch_name}" >> $GITHUB_ENV
+      - name: Run publication update script
+        run: python scripts/update_publications.py
+      - name: Check for changes
+        id: check_changes
+        run: |
+          NEW_FILES=$(git ls-files --other --exclude-standard content/publication)
+          if [ -z "${NEW_FILES}" ]; then
+            echo "No new publications found."
+            echo "has_new=false" >> $GITHUB_OUTPUT
+          else
+            echo "has_new=true" >> $GITHUB_OUTPUT
+          fi
+      - name: Commit changes
+        # Must reference the id of the "Check for changes" step above
+        if: steps.check_changes.outputs.has_new == 'true'
+        run: |
+          # git refuses to commit on a fresh runner without an identity
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git checkout main
+          git checkout -b "$branch_name"
+          git add content/publication
+          git commit -m "Automated update of publications" || echo "No changes to commit"
+      - name: Push branch
+        if: steps.check_changes.outputs.has_new == 'true'
+        run: git push origin HEAD
+        env:
+          GH_TOKEN: ${{ github.token }}
+      - name: Create Pull Request
+        if: steps.check_changes.outputs.has_new == 'true'
+        run: |
+          branch_name=$(git rev-parse --abbrev-ref HEAD)
+          gh pr create \
+            --title "Update publications" \
+            --body "Automated update of publications since ${CHECK_FROM}." \
+            --base main \
+            --head "$branch_name"
+        env:
+          GH_TOKEN: ${{ github.token }}
diff --git a/scripts/update_publications.py b/scripts/update_publications.py
@@ -0,0 +1,242 @@
+import requests
+import os
+import re
+import datetime
+import yaml
+from pathlib import Path
+import unidecode
+
+# === Config ===
+PROJECT_NAME = "gysela"     # change to your project keyword
+# Every path is anchored on this file's location so that the script behaves
+# the same whatever the current working directory is.
+AUTHOR_DIR = Path(__file__).parent.parent / "content" / "authors"
+PUBLICATION_DIR = Path(__file__).parent.parent / "content" / "publication"
+# The abbreviation file sits next to this script, not in the working directory.
+VENUE_ABBREVIATIONS_FILE = Path(__file__).parent / "venue_abbreviations.yml"
+# Earliest publication date (YYYY-MM-DD) to search from; falls back to the
+# documented default when the environment variable is not set.
+CHECK_FROM = os.environ.get("CHECK_FROM", "2019-01-01")
+
+# Names of the publication folders that already exist. `name` is used rather
+# than `stem` so that a folder name containing a dot is not truncated.
+existing_slugs = {p.name for p in PUBLICATION_DIR.iterdir() if p.is_dir()}
+
+# === Helpers ===
+def load_abbrev_map():
+    """Load the venue abbreviation table from VENUE_ABBREVIATIONS_FILE.
+
+    Returns:
+        The mapping parsed from the YAML file, or an empty dict when the file
+        is missing or empty.
+    """
+    if not VENUE_ABBREVIATIONS_FILE.exists():
+        return {}
+    with open(VENUE_ABBREVIATIONS_FILE, encoding="utf-8") as f:
+        # safe_load returns None for an empty document; callers use `in` on
+        # the result, so always hand back a mapping.
+        return yaml.safe_load(f) or {}
+
+def load_key_authors():
+    """Collect the authors described by the Markdown files under AUTHOR_DIR.
+
+    Only files whose YAML front matter defines both ``name`` and
+    ``organizations`` are kept.
+
+    Returns:
+        A list of dicts with the keys ``name`` (the author's name without its
+        first word) and ``organizations`` (a list of organization names).
+    """
+    key_authors = []
+    for md_file in AUTHOR_DIR.rglob("*.md"):
+        content = md_file.read_text(encoding="utf-8")
+        if not content.startswith("---"):
+            continue
+        data = yaml.safe_load(content.split("---", 2)[1])
+        # Empty front matter parses to None; anything that is not a mapping
+        # cannot describe an author.
+        if not isinstance(data, dict):
+            continue
+        if "name" not in data or "organizations" not in data:
+            continue
+        words = str(data["name"] or "").split()
+        if not words:
+            continue
+        # Drop the first word (the given name). A single-word name is kept
+        # whole: an empty string would be a substring of every author name.
+        surname = " ".join(words[1:]) or words[0]
+        orgs = [
+            o["name"]
+            for o in data.get("organizations") or []
+            if isinstance(o, dict) and o.get("name")
+        ]
+        key_authors.append({"name": surname, "organizations": orgs})
+    return key_authors
+
+def load_known_dois():
+    """Collect the DOIs of the publications already stored under PUBLICATION_DIR.
+
+    Each DOI found in a file's YAML front matter is registered as written and
+    also in bare (``10.xxxx/...``) and ``https://doi.org/...`` form, so that
+    a membership test succeeds whichever form the caller holds.
+
+    Returns:
+        A set of DOI strings.
+    """
+    dois = set()
+    for md_file in PUBLICATION_DIR.rglob("*.md"):
+        content = md_file.read_text(encoding="utf-8")
+        if not content.startswith("---"):
+            continue
+        data = yaml.safe_load(content.split("---", 2)[1])
+        # Empty front matter parses to None; an empty doi carries no information.
+        if not isinstance(data, dict) or not data.get("doi"):
+            continue
+        raw = str(data["doi"]).strip()
+        bare = re.sub(r"^https?://(dx\.)?doi\.org/", "", raw, flags=re.IGNORECASE)
+        # DOIs are case-insensitive, so the lower-cased variants are stored too.
+        dois.update({
+            raw,
+            bare,
+            bare.lower(),
+            f"https://doi.org/{bare}",
+            f"https://doi.org/{bare.lower()}",
+        })
+    return dois
+
+def get_first_author_surname(authorships):
+    """Return the lower-case ASCII surname of the first author.
+
+    The surname is taken to be the last whitespace-separated word of the
+    first authorship's ``display_name``.
+
+    Returns:
+        The transliterated surname, or ``"unknown"`` when there is no
+        authorship or the first author has no usable display name.
+    """
+    if not authorships:
+        return "unknown"
+    display_name = authorships[0]["author"]["display_name"] or ""
+    name_parts = display_name.split()
+    # A null or blank display name has no last word to pick.
+    if not name_parts:
+        return "unknown"
+    return unidecode.unidecode(name_parts[-1]).lower()
+
+def get_all_authors(authorships):
+    """Join every author's display name with " and ", or return "Unknown" when there are none."""
+    if not authorships:
+        return "Unknown"
+    names = [entry["author"]["display_name"] for entry in authorships]
+    return " and ".join(names)
+
+def author_matches(work_authorships, key_authors):
+    """Tell whether any authorship of a work belongs to one of the key authors.
+
+    An authorship matches when a key author's name is contained in its
+    ``raw_author_name`` and one of that key author's organizations is contained
+    in one of its ``raw_affiliation_string`` values (both case-insensitive).
+
+    Args:
+        work_authorships: authorship dicts of one work.
+        key_authors: dicts with the keys ``name`` and ``organizations``.
+
+    Returns:
+        True as soon as one authorship matches, False otherwise.
+    """
+    for authorship in work_authorships or []:
+        # Missing or null fields are treated as empty so they simply do not match.
+        author_name = (authorship.get("raw_author_name") or "").lower()
+        affiliations = [
+            (aff.get("raw_affiliation_string") or "").lower()
+            for aff in authorship.get("affiliations") or []
+        ]
+        for ka in key_authors:
+            name = ka["name"].lower()
+            # An empty string is a substring of everything: never match on it.
+            if not name or name not in author_name:
+                continue
+            if any(org and org.lower() in aff
+                   for org in ka["organizations"] for aff in affiliations):
+                return True
+    return False
+
+def make_slug(meta, abbrev_map):
+    """Build a unique folder name of the form ``<surname>-<venue>-<year>``.
+
+    The venue part is the ``slug`` entry of ``abbrev_map`` when the venue is
+    listed there. Otherwise the venue name is reduced to lower-case ASCII
+    letters and digits separated by hyphens, so that it is safe to use as a
+    folder name. When the resulting name is already in ``existing_slugs``,
+    ``_2``, ``_3``, ... is appended until it is free.
+
+    Args:
+        meta: metadata dict providing ``authorships``, ``venue_full`` and ``year``.
+        abbrev_map: mapping from full venue name to a dict holding a ``slug``.
+
+    Returns:
+        The new slug, which is also recorded in ``existing_slugs``.
+    """
+    surname = get_first_author_surname(meta["authorships"])
+    venue_full = meta["venue_full"]
+    if venue_full in abbrev_map:
+        venue = abbrev_map[venue_full]['slug']
+    else:
+        # Raw venue names contain spaces, dots or slashes; keep only
+        # characters that are harmless in a path component.
+        ascii_venue = unidecode.unidecode(venue_full or "").lower()
+        venue = re.sub(r"[^a-z0-9]+", "-", ascii_venue).strip("-") or "unknown"
+    year = str(meta["year"])
+    slug_base = f"{surname}-{venue}-{year}"
+    slug = slug_base
+    i = 2
+    while slug in existing_slugs:
+        slug = f'{slug_base}_{i}'
+        i += 1
+    existing_slugs.add(slug)
+    return slug
+
+def _abstract_from_inverted_index(inverted_index):
+    """Rebuild the abstract text from a ``{word: [positions]}`` mapping."""
+    if not inverted_index:
+        return ""
+    positioned = [
+        (position, word)
+        for word, positions in inverted_index.items()
+        for position in positions
+    ]
+    return " ".join(word for _, word in sorted(positioned))
+
+
+def extract_metadata(work, abbrev_map):
+    """Extract shared metadata for front_matter and bibtex.
+
+    Args:
+        work: one work record as returned by the OpenAlex API.
+        abbrev_map: unused here; kept so existing callers keep working.
+
+    Returns:
+        A dict with the keys ``title``, ``authors_list``, ``authors_bibtex``,
+        ``authorships``, ``venue_full``, ``year``, ``doi`` (bare form, without
+        the ``https://doi.org/`` prefix, or None), ``url``, ``pub_date``,
+        ``volume``, ``issue``, ``pages``, ``surname`` and ``abstract``.
+    """
+    # Fields may be present with a null value, so `or` is used instead of
+    # relying on the default of dict.get.
+    title = work.get("title") or ""
+    authorships = work.get("authorships") or []
+    authors_list = [a["author"]["display_name"] for a in authorships]
+    authors_bibtex = get_all_authors(authorships)
+    surname = get_first_author_surname(authorships)
+
+    venue_host = (work.get("host_venue") or {}).get("display_name")
+    source = (work.get("primary_location") or {}).get("source") or {}
+    venue_primary = source.get("display_name")
+    venue_full = venue_primary or venue_host or ""
+
+    year = work.get("publication_year") or ""
+    doi = work.get("doi") or None
+    if doi:
+        # OpenAlex reports the DOI as a full URL; strip the resolver prefix so
+        # the URL built below is not doubled.
+        doi = re.sub(r"^https?://(dx\.)?doi\.org/", "", doi.strip(), flags=re.IGNORECASE)
+    url = f"https://doi.org/{doi}" if doi else None
+    pub_date = work.get("publication_date") or "1900-01-01"
+
+    biblio = work.get("biblio") or {}
+    volume = biblio.get("volume")
+    issue = biblio.get("issue")
+    first_page = biblio.get("first_page")
+    last_page = biblio.get("last_page")
+    pages = f"{first_page}--{last_page}" if first_page and last_page else None
+
+    # Joining the keys alone would give each word once, in arbitrary order.
+    abstract = _abstract_from_inverted_index(work.get("abstract_inverted_index"))
+    return {
+        "title": title,
+        "authors_list": authors_list,
+        "authors_bibtex": authors_bibtex,
+        "authorships": authorships,
+        "venue_full": venue_full,
+        "year": year,
+        "doi": doi,
+        "url": url,
+        "pub_date": pub_date,
+        "volume": volume,
+        "issue": issue,
+        "pages": pages,
+        "surname": surname,
+        "abstract": abstract
+    }
+
+def to_bibtex(meta, slug, abbrev_map):
+    """Render ``meta`` as an ``@article`` BibTeX entry whose citation key is ``slug``.
+
+    The journal is the ``bibtex`` entry of ``abbrev_map`` when the venue is
+    listed there, and the full venue name otherwise. Fields whose value is
+    falsy are left out.
+    """
+    full_name = meta["venue_full"]
+    journal = abbrev_map[full_name]['bibtex'] if full_name in abbrev_map else full_name
+    candidates = [
+        ("title", meta["title"]),
+        ("author", meta["authors_bibtex"]),
+        ("journal", journal),
+        ("year", meta["year"]),
+        ("volume", meta["volume"]),
+        ("number", meta["issue"]),
+        ("pages", meta["pages"]),
+        ("doi", meta["doi"]),
+        ("url", meta["url"]),
+    ]
+    entry = [f"@article{{{slug},"]
+    for key, value in candidates:
+        if value:
+            entry.append(f"  {key} = {{{value}}},")
+    # The last line must not end with a comma
+    entry[-1] = entry[-1].rstrip(",")
+    entry.append("}")
+    return "\n".join(entry)
+
+def write_index_md(folder, meta):
+    """Write the YAML front matter of one publication to ``folder/index.md``.
+
+    Args:
+        folder: existing directory (a ``Path``) that receives the file.
+        meta: metadata dict providing ``title``, ``authors_list``,
+            ``pub_date``, ``abstract``, ``venue_full`` and ``doi``.
+    """
+    # One timestamp for both fields so that they cannot differ.
+    now = datetime.datetime.now().isoformat()
+    front_matter = {
+        "title": meta["title"],
+        "subtitle": "",
+        "summary": "",
+        "authors": meta["authors_list"],
+        "tags": [],
+        "categories": [],
+        "date": meta["pub_date"],
+        "lastmod": now,
+        "featured": False,
+        "draft": False,
+        "image": {"caption": "", "focal_point": "", "preview_only": False},
+        "projects": [],
+        "publishDate": now,
+        "publication_types": ["1"],
+        "abstract": meta["abstract"],
+        "publication": meta["venue_full"],
+        "doi": meta["doi"] or ""
+    }
+    # allow_unicode keeps accented names readable instead of escaping them;
+    # the file is written as UTF-8 below.
+    body = yaml.dump(front_matter, sort_keys=False, allow_unicode=True)
+    index_md = "---\n" + body + "---\n"
+    (folder / "index.md").write_text(index_md, encoding="utf-8")
+
+# === Main ===
+def main():
+    """Search OpenAlex for new publications and write one folder per new work.
+
+    For each search keyword, preprints are skipped, then works are kept when
+    the keyword appears in the title or abstract or when one of the authors is
+    a key author. Works whose DOI is already known, or that were already
+    handled during this run, are skipped. Each remaining work gets an
+    ``index.md`` and a ``cite.bib`` in a new sub-folder of PUBLICATION_DIR.
+    """
+    abbrev_map = load_abbrev_map()
+    key_authors = load_key_authors()
+    dois = load_known_dois()
+    # OpenAlex ids of the works saved during this run; catches works without
+    # a DOI that are returned for more than one keyword.
+    seen_ids = set()
+
+    for keyword in ('gysela', 'gyselax', 'gyselalib'):
+        results = list(_fetch_works(keyword))
+        print(f"Found {len(results)} results for {keyword} since {CHECK_FROM}")
+
+        for work in results:
+            # Discard works typed as preprints
+            if work.get("type") == "preprint":
+                continue
+
+            meta = extract_metadata(work, abbrev_map)
+
+            # Discard preprints recognisable only by their venue
+            if "arxiv" in meta["venue_full"].lower():
+                continue
+
+            # Check relevance
+            gysela_in_title = keyword in (meta["title"] or "").lower()
+            gysela_in_abstract = keyword in (meta["abstract"] or "").lower()
+            written_by_key_author = author_matches(meta["authorships"], key_authors)
+            if not (gysela_in_title or gysela_in_abstract) and \
+                    not written_by_key_author:
+                print("Discarding citation : ", meta["title"], meta["authors_list"])
+                continue
+
+            # Discard if already found. A missing DOI is never recorded:
+            # otherwise the first DOI-less work would hide all the others.
+            doi = meta["doi"]
+            work_id = work.get("id")
+            if (doi and doi in dois) or (work_id and work_id in seen_ids):
+                continue
+            if doi:
+                dois.add(doi)
+            if work_id:
+                seen_ids.add(work_id)
+
+            print("Saving :")
+            print("    ", meta["title"])
+            print("    ", meta["authors_list"])
+            if gysela_in_title or gysela_in_abstract:
+                print("Mentioning Gysela prominently")
+            if written_by_key_author:
+                print("Written by permanent contributor")
+            print()
+
+            slug = make_slug(meta, abbrev_map)
+            folder = PUBLICATION_DIR / slug
+            folder.mkdir(parents=True, exist_ok=True)
+
+            # Write index.md
+            write_index_md(folder, meta)
+
+            # Write cite.bib
+            bibtex = to_bibtex(meta, slug, abbrev_map)
+            (folder / "cite.bib").write_text(bibtex, encoding="utf-8")
+
+
+def _fetch_works(keyword):
+    """Yield every OpenAlex work matching ``keyword`` published since CHECK_FROM.
+
+    Results are read page by page with cursor paging, so searches returning
+    more than one page are not truncated.
+    """
+    params = {
+        "search": keyword,
+        "filter": f"from_publication_date:{CHECK_FROM}",
+        "per-page": 100,
+        "cursor": "*",
+    }
+    while params["cursor"]:
+        # A timeout keeps the CI job from hanging on a stalled connection.
+        response = requests.get("https://api.openalex.org/works", params=params, timeout=60)
+        response.raise_for_status()
+        data = response.json()
+        results = data.get("results", [])
+        if not results:
+            return
+        yield from results
+        params["cursor"] = (data.get("meta") or {}).get("next_cursor")
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/scripts/venue_abbreviations.yml b/scripts/venue_abbreviations.yml
@@ -0,0 +1,36 @@
+# Abbreviations for publication venues.
+# Each key is the full venue name as it is compared with `venue_full` in
+# scripts/update_publications.py, so it must match that string exactly.
+#   slug:   short name used by make_slug in the publication folder name
+#   bibtex: name used by to_bibtex in the `journal` field of cite.bib
+Journal of Computational Physics:
+  slug: jcp
+  bibtex: "J. Comput. Phys."
+Journal of Plasma Physics:
+  slug: jpp
+  bibtex: "J. Plasma Phys."
+Computer Physics Communications:
+  slug: cpc
+  bibtex: "Comput. Phys. Commun."
+Concurrency and Computation Practice and Experience:
+  slug: ccpe
+  bibtex: "Concurrency and Computation Practice and Experience"
+Plasma Physics and Controlled Fusion:
+  slug: ppcf
+  bibtex: "Plasma Phys. Controlled Fusion"
+SMAI Journal of Computational Mathematics:
+  slug: smai
+  bibtex: "SMAI Journal of Computational Mathematics"
+Communications Physics:
+  slug: cp
+  bibtex: "Commun. Phys."
+The International Journal of High Performance Computing Applications:
+  slug: ijhpca
+  bibtex: "Int. J. High Perform. Comput. Appl."
+# NOTE(review): "po-p" is hyphenated unlike the other slugs — confirm it
+# matches the naming of the existing publication folders.
+Physics of Plasmas:
+  slug: po-p
+  bibtex: "Phys. Plasmas"
+Nuclear Fusion:
+  slug: nf
+  bibtex: "Nucl. Fusion"
+# NOTE(review): the unusual capitalisation and punctuation of this key are
+# presumably how the data source spells the journal — do not "correct" it
+# without checking.
+Physical review. E:
+  slug: pre
+  bibtex: "Phys. Rev. E"
+Physical Review Letters:
+  slug: prl
+  bibtex: "Phys. Rev. Lett."