README.md (+42 −11)
@@ -1,8 +1,8 @@
-# NYC InfoHub Excel Data Scraper
+# Excel API Web Scraper

## Description

-**NYC InfoHub Excel Data Scraper** is a Python-based project that automates the process of web scraping, downloading, and storing Excel files from the NYC InfoHub website. The scraper dynamically discovers subpages, detects relevant Excel links (filtered by year), downloads them asynchronously, and ensures that only new or changed files are saved.
+**Excel API Web Scraper** is a Python-based project that automates the process of web scraping, downloading, and storing Excel files from the NYC InfoHub website. The scraper dynamically discovers subpages, detects relevant Excel links (filtered by year), downloads them asynchronously, and ensures that only new or changed files are saved.

This version features:
- **Asynchronous HTTP/2 downloads** via `httpx.AsyncClient`
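
As a rough illustration of the bullet above, asynchronous HTTP/2 downloads with `httpx.AsyncClient` might be structured along these lines; the URL, output directory, and function names are placeholders, not the project's actual code.

```python
# Minimal sketch (assumed names and paths): concurrent downloads over HTTP/2 with httpx.
import asyncio
from pathlib import Path

import httpx


async def download_one(client: httpx.AsyncClient, url: str, out_dir: Path) -> None:
    resp = await client.get(url)
    resp.raise_for_status()
    # Save the payload under the last path segment of the URL.
    (out_dir / url.rsplit("/", 1)[-1]).write_bytes(resp.content)


async def download_all(urls: list[str], out_dir: str = "data") -> None:
    target = Path(out_dir)
    target.mkdir(parents=True, exist_ok=True)
    async with httpx.AsyncClient(http2=True, timeout=30.0) as client:
        await asyncio.gather(*(download_one(client, u, target) for u in urls))


if __name__ == "__main__":
    asyncio.run(download_all(["https://example.org/reports/graduation_2024.xlsx"]))
```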
@@ -13,8 +13,6 @@ This version features:

---

-**Important Note**: The previous iteration was fully functional and efficient, however relied too heavily on hardcoding. In this version, I use RegEx patterns to parse through sub-pages.
-
## Features

- **Web Scraping with Selenium**
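
As a loose sketch of the Selenium-plus-regex approach described above, link discovery and year filtering could look roughly like this; the page URL and year cutoff are invented for illustration and are not the project's real values.

```python
# Hedged sketch: collect Excel links from a page with Selenium, keep only recent years via regex.
import re

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

EXCEL_LINK = re.compile(r"\.xls[xbm]?$", re.IGNORECASE)   # .xls, .xlsx, .xlsb, .xlsm
RECENT_YEAR = re.compile(r"20(2[3-9])")                   # placeholder cutoff: 2023 and later

options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://example.org/reports")  # placeholder page, not the real InfoHub URL
    hrefs = [a.get_attribute("href") or "" for a in driver.find_elements(By.TAG_NAME, "a")]
    excel_links = [h for h in hrefs if EXCEL_LINK.search(h) and RECENT_YEAR.search(h)]
    print(excel_links)
finally:
    driver.quit()
```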
@@ -59,10 +57,10 @@ Dependencies:
- `httpx[http2]`: For performing asynchronous HTTP requests and HTTP/2 support
- `selenium`: For web scraping
- `pandas`: For processing Excel files
-- `requests`: For downloading files
- `tqdm`: To display download progress
- `concurrent.futures`: For multithreading
- `openpyxl`, `pyxlsb`, `xlrd`: For handling different Excel file types
+- `pytest`, `pytest-asyncio`, `pytest-cov`: For module testing
```

---
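
Since several Excel engines are listed, here is a small hedged example of how pandas might be pointed at each format; the file paths are placeholders, not files shipped with the project.

```python
# Minimal sketch: pandas selects a different engine per Excel format.
# The paths below are illustrative only.
import pandas as pd

xlsx_df = pd.read_excel("data/graduation/results_2024.xlsx", engine="openpyxl")  # modern .xlsx
xlsb_df = pd.read_excel("data/attendance/results_2024.xlsb", engine="pyxlsb")    # binary .xlsb
xls_df = pd.read_excel("data/demographics/results_2005.xls", engine="xlrd")      # legacy .xls

print(xlsx_df.shape, xlsb_df.shape, xls_df.shape)
```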
@@ -73,20 +71,24 @@ Dependencies:
project_root/
│
├── __init__.py            # Package initializer
+├── .github                # Workflow CI/CD integration
├── .gitignore             # Ignore logs, venv, data, and cache files
├── .env                   # Environment variables (excluded from version control)
├── README.md              # Project documentation
├── requirements.txt       # Project dependencies
├── setup.py               # Project packaging file
+├── pyproject.toml         # Specify build system requirements
├── LICENSE                # License file
│
├── venv/                  # Virtual environment (ignored by version control)
-│
-├── nyc_infohub.py         # Main scraper script
-├── url_scraper.py         # Web scraping module
+│
+├── src/
+│   ├── main.py            # Main scraper script
+│   └── excel_scraper.py   # Web scraping module
│
├── logs/                  # Directory for log files
-│   └── excel_fetch.log
+│
+├── tests/                 # Directory for unit, integration, and end-to-end testing
│
├── data/                  # Directory for downloaded Excel files
│   ├── graduation/
@@ -106,7 +108,7 @@ This structure ensures that the project is well-organized for both manual execut
### **Running the Scraper Manually**
1. **Run the script to scrape and fetch new datasets:**
```bash
-python nyc_infohub.py
+python main.py
```
2. **View logs for download status and debugging:**
```bash
@@ -145,6 +147,34 @@ This structure ensures that the project is well-organized for both manual execut

---

+## Testing
+
+We use **Pytest** for our test suite, located in the `tests/` folder.
+
+1. **Install dev/test dependencies** (either in your `setup.py` or via `pip install -r requirements.txt` if you listed them there).
+
+2. **Run tests**:
+```bash
+python -m pytest tests/
+```
+
+3. **View Coverage** (if you have `pytest-cov`):
+```bash
+python -m pytest tests/ --cov=src
+```
+
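
To give a flavor of what a test under `tests/` might look like, here is a small hedged example using `pytest-asyncio`; the coroutine under test is defined inline and only mimics the project's year-filtering idea, it is not the real implementation.

```python
# Hedged sketch: an asynchronous test written with pytest-asyncio.
import asyncio
import re

import pytest


async def latest_year(links: list[str]) -> int:
    """Return the most recent four-digit year found in the given links."""
    await asyncio.sleep(0)  # stand-in for awaited network I/O
    return max(int(m.group()) for link in links if (m := re.search(r"20\d{2}", link)))


@pytest.mark.asyncio
async def test_latest_year_picks_most_recent():
    links = ["reports/graduation_2021.xlsx", "reports/graduation_2023.xlsx"]
    assert await latest_year(links) == 2023
```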
+---
+
+## CI/CD Pipeline
+
+A GitHub Actions workflow is set up in `.github/workflows/ci-cd.yml`. It:
+
+1. **Builds and tests** the project on push or pull request to the `main` branch.
+2. If tests pass and you push a **tagged release**, it **builds a distribution** and can **upload** it to PyPI using **Twine** (when secrets are configured).
+3. Check the **Actions** tab on your repo to see logs and statuses of each workflow run.
+
+---
+
## **Previous Limitations and Solutions**
***Bottlenecks***:

@@ -157,6 +187,7 @@ This structure ensures that the project is well-organized for both manual execut
1. Optimized Downloading: Parallel downloads using asyncio and ThreadPoolExecutor allow multiple downloads to happen concurrently, improving speed.
2. Persistent HTTP Sessions: Using httpx.AsyncClient ensures that HTTP connections are reused, reducing overhead.
3. Efficient Hashing: Files are saved only if they have changed, as determined by a computed hash. This ensures no unnecessary downloads.
+4. Excluded older datasets by adding `re` filtering logic to scrape only the latest available data.

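A rough sketch of the hash-based change check mentioned in point 3; the helper name and storage layout are assumptions for illustration, not the project's exact code.

```python
# Hedged sketch: skip re-saving a file whose content hash is unchanged.
import hashlib
from pathlib import Path


def save_if_changed(content: bytes, dest: Path) -> bool:
    """Write `content` to `dest` only if its SHA-256 differs from what is already on disk."""
    new_hash = hashlib.sha256(content).hexdigest()
    if dest.exists():
        old_hash = hashlib.sha256(dest.read_bytes()).hexdigest()
        if old_hash == new_hash:
            return False  # identical file already stored; skip the write
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_bytes(content)
    return True
```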
---

@@ -169,7 +200,7 @@ This structure ensures that the project is well-organized for both manual execut
---

## **Other Potential Improvements**
-- **Exclude older datasets**: Add filtering logic to scrape only the latest available data.
+- **Add NYSed Website**: Scrape data from NYSed.
- **Email Notifications**: Notify users when a new dataset is fetched.
- **Database Integration**: Store metadata in a database for better tracking.
- **Better Exception Handling**: Improve error logging for specific failures.