Commit d51c74e

Incorporating encapsulation into NYCInfoHubScraper - planning to streamline by enhancing the class
1 parent c71d68d commit d51c74e

File tree

1 file changed: +22 -22 lines changed

src/excel_scraper.py (+22 -22)
@@ -52,21 +52,21 @@ class NYCInfoHubScraper:
 
     def __init__(self, base_dir=None, data_dir=None, hash_dir=None, log_dir=None):
         # Initialize directories
-        self.base_dir = base_dir or os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
-        self.data_dir = data_dir or os.path.join(self.base_dir, "data")
-        self.hash_dir = hash_dir or os.path.join(self.base_dir, "hashes")
-        self.log_dir = log_dir or os.path.join(self.base_dir, "logs")
+        self._base_dir = base_dir or os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+        self._data_dir = data_dir or os.path.join(self._base_dir, "data")
+        self._hash_dir = hash_dir or os.path.join(self._base_dir, "hashes")
+        self._log_dir = log_dir or os.path.join(self._base_dir, "logs")
 
         # Re-create directories if needed
-        os.makedirs(self.data_dir, exist_ok=True)
-        os.makedirs(self.hash_dir, exist_ok=True)
-        os.makedirs(self.log_dir, exist_ok=True)
+        os.makedirs(self._data_dir, exist_ok=True)
+        os.makedirs(self._hash_dir, exist_ok=True)
+        os.makedirs(self._log_dir, exist_ok=True)
 
         # Configure Selenium driver
-        self.driver = self.configure_driver()
+        self._driver = self.configure_driver()
 
         # Create an async HTTP client with concurrency limits
-        self.session = httpx.AsyncClient(
+        self._session = httpx.AsyncClient(
             http2=True, limits=httpx.Limits(max_connections=80, max_keepalive_connections=40),
             timeout=5
         )
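
Note: the underscore prefix only marks these attributes as private by convention. If the "enhancing the class" follow-up adds controlled access, it would plausibly take the form of read-only properties like the sketch below; this is an illustration of a possible next step, not code from this commit:

    @property
    def data_dir(self):
        """Read-only view of the data directory path."""
        return self._data_dir

    @property
    def session(self):
        """Read-only view of the shared httpx.AsyncClient."""
        return self._session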
@@ -137,15 +137,15 @@ async def discover_relevant_subpages(self, url, depth=1, visited=None):
 
         discovered_links = set()
         try:
-            self.driver.get(url)
-            WebDriverWait(self.driver, 5).until(
+            self._driver.get(url)
+            WebDriverWait(self._driver, 5).until(
                 EC.presence_of_element_located((By.TAG_NAME, "a"))
             )
         except Exception as e:
             logging.error(f"❌ Error loading {url}: {e}")
             return discovered_links
 
-        anchors = self.driver.find_elements(By.TAG_NAME, "a")
+        anchors = self._driver.find_elements(By.TAG_NAME, "a")
         for a in anchors:
             href = a.get_attribute("href")
             # Uses skip check to avoid unnecessary processing and quickly verify which links to skip
@@ -177,15 +177,15 @@ async def scrape_page_links(self, url, visited=None):
 
         valid_links = []
         try:
-            self.driver.get(url)
-            WebDriverWait(self.driver, 10).until(
+            self._driver.get(url)
+            WebDriverWait(self._driver, 10).until(
                 EC.presence_of_element_located((By.TAG_NAME, "a"))
             )
         except Exception as e:
             logging.error(f"❌ Error waiting for page load on {url}: {e}")
             return valid_links  # return empty if the page failed
 
-        anchors = self.driver.find_elements(By.TAG_NAME, "a")
+        anchors = self._driver.find_elements(By.TAG_NAME, "a")
         for a in anchors:
             href = a.get_attribute("href")
             # Skip if already visited
@@ -246,7 +246,7 @@ async def download_excel(self, url):
         Returns (url, content) if successful, or (url, None) otherwise.
         """
         try:
-            async with self.session.stream("GET", url, timeout=10) as resp:
+            async with self._session.stream("GET", url, timeout=10) as resp:
                 if resp.status_code == 200:
                     # Accumulate chunks in memory (still better than reading all at once)
                     chunks = []
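
The hunk is cut off right after the chunk list, but the comment points at the standard httpx streaming pattern: iterate the response body in pieces and join at the end. A sketch of how the method plausibly continues (aiter_bytes() is httpx's async byte iterator; the actual remainder of the method is not shown in this diff):

                    # sketch, not from this commit: accumulate the streamed body
                    async for chunk in resp.aiter_bytes():
                        chunks.append(chunk)
                    return url, b"".join(chunks)
            # non-200 responses fall through to (url, None), per the docstring
            return url, None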
@@ -319,10 +319,10 @@ def save_file(self, url, content, new_hash):
         file_name = os.path.basename(url)
         category = self.categorize_file(file_name)
 
-        save_path = os.path.join(self.data_dir, category, file_name)
+        save_path = os.path.join(self._data_dir, category, file_name)
         os.makedirs(os.path.dirname(save_path), exist_ok=True)
 
-        hash_path = os.path.join(self.hash_dir, category, f"{file_name}.hash")
+        hash_path = os.path.join(self._hash_dir, category, f"{file_name}.hash")
         os.makedirs(os.path.dirname(hash_path), exist_ok=True)
 
         old_hash = None
@@ -345,14 +345,14 @@ def save_file(self, url, content, new_hash):
     async def close(self):
         """Close Selenium and the async httpx session."""
         # Close the WebDriver
-        if self.driver:
-            self.driver.quit()
-            self.driver = None
+        if self._driver:
+            self._driver.quit()
+            self._driver = None
             logging.info("WebDriver closed.")
 
         # Close the HTTPX session
         try:
-            await self.session.aclose()
+            await self._session.aclose()
         except Exception as e:
             logging.error(f"❌ Error closing session: {e}")
         finally:
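
Because close() releases both the WebDriver and the httpx session, callers would normally guarantee it runs with try/finally. A minimal usage sketch; the entry point and target URL are assumptions, since only scrape_page_links() and close() appear in this diff:

import asyncio

async def main():
    scraper = NYCInfoHubScraper()  # default base/data/hash/log directories
    try:
        # hypothetical driver code; the full pipeline is not part of this diff
        links = await scraper.scrape_page_links("https://infohub.nyced.org")
        print(f"Found {len(links)} candidate links")
    finally:
        # always release Selenium and the async HTTP session
        await scraper.close()

if __name__ == "__main__":
    asyncio.run(main())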
