@@ -52,21 +52,21 @@ class NYCInfoHubScraper:

     def __init__(self, base_dir=None, data_dir=None, hash_dir=None, log_dir=None):
         # Initialize directories
-        self.base_dir = base_dir or os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
-        self.data_dir = data_dir or os.path.join(self.base_dir, "data")
-        self.hash_dir = hash_dir or os.path.join(self.base_dir, "hashes")
-        self.log_dir = log_dir or os.path.join(self.base_dir, "logs")
+        self._base_dir = base_dir or os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+        self._data_dir = data_dir or os.path.join(self._base_dir, "data")
+        self._hash_dir = hash_dir or os.path.join(self._base_dir, "hashes")
+        self._log_dir = log_dir or os.path.join(self._base_dir, "logs")

         # Re-create directories if needed
-        os.makedirs(self.data_dir, exist_ok=True)
-        os.makedirs(self.hash_dir, exist_ok=True)
-        os.makedirs(self.log_dir, exist_ok=True)
+        os.makedirs(self._data_dir, exist_ok=True)
+        os.makedirs(self._hash_dir, exist_ok=True)
+        os.makedirs(self._log_dir, exist_ok=True)

         # Configure Selenium driver
-        self.driver = self.configure_driver()
+        self._driver = self.configure_driver()

         # Create an async HTTP client with concurrency limits
-        self.session = httpx.AsyncClient(
+        self._session = httpx.AsyncClient(
             http2=True, limits=httpx.Limits(max_connections=80, max_keepalive_connections=40),
             timeout=5
         )
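
Note: the hunk above also fixes the HTTP client's concurrency budget. As a standalone illustration, here is a minimal sketch of the same httpx configuration; the target URL is a placeholder, and http2=True requires the optional extra (pip install 'httpx[http2]'):

import asyncio
import httpx

async def main():
    # Same limits as NYCInfoHubScraper.__init__: at most 80 open connections,
    # of which up to 40 may be kept alive for reuse.
    limits = httpx.Limits(max_connections=80, max_keepalive_connections=40)
    async with httpx.AsyncClient(http2=True, limits=limits, timeout=5) as session:
        resp = await session.get("https://www.example.com/")  # placeholder URL
        print(resp.status_code, resp.http_version)

asyncio.run(main())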
@@ -137,15 +137,15 @@ async def discover_relevant_subpages(self, url, depth=1, visited=None):

         discovered_links = set()
         try:
-            self.driver.get(url)
-            WebDriverWait(self.driver, 5).until(
+            self._driver.get(url)
+            WebDriverWait(self._driver, 5).until(
                 EC.presence_of_element_located((By.TAG_NAME, "a"))
             )
         except Exception as e:
             logging.error(f"❌ Error loading {url}: {e}")
             return discovered_links

-        anchors = self.driver.find_elements(By.TAG_NAME, "a")
+        anchors = self._driver.find_elements(By.TAG_NAME, "a")
         for a in anchors:
             href = a.get_attribute("href")
             # Uses skip check to avoid unnecessary processing and quickly verify which links to skip
@@ -177,15 +177,15 @@ async def scrape_page_links(self, url, visited=None):

         valid_links = []
         try:
-            self.driver.get(url)
-            WebDriverWait(self.driver, 10).until(
+            self._driver.get(url)
+            WebDriverWait(self._driver, 10).until(
                 EC.presence_of_element_located((By.TAG_NAME, "a"))
             )
         except Exception as e:
             logging.error(f"❌ Error waiting for page load on {url}: {e}")
             return valid_links  # return empty if the page failed

-        anchors = self.driver.find_elements(By.TAG_NAME, "a")
+        anchors = self._driver.find_elements(By.TAG_NAME, "a")
         for a in anchors:
             href = a.get_attribute("href")
             # Skip if already visited
@@ -246,7 +246,7 @@ async def download_excel(self, url):
         Returns (url, content) if successful, or (url, None) otherwise.
         """
         try:
-            async with self.session.stream("GET", url, timeout=10) as resp:
+            async with self._session.stream("GET", url, timeout=10) as resp:
                 if resp.status_code == 200:
                     # Accumulate chunks in memory (still better than reading all at once)
                     chunks = []
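
The hunk ends just as the body is about to be accumulated. For readers of the diff, a hedged sketch of how such a streaming download typically continues with httpx; the PR's actual continuation is not shown here and may differ:

import httpx

async def download_excel(session: httpx.AsyncClient, url: str):
    # Sketch only: mirrors the visible part of the hunk, then completes the
    # usual chunked-read pattern; the real method may differ past this point.
    try:
        async with session.stream("GET", url, timeout=10) as resp:
            if resp.status_code != 200:
                return url, None
            chunks = []
            async for chunk in resp.aiter_bytes():  # body arrives incrementally
                chunks.append(chunk)
            return url, b"".join(chunks)
    except httpx.HTTPError:
        return url, None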
@@ -319,10 +319,10 @@ def save_file(self, url, content, new_hash):
         file_name = os.path.basename(url)
         category = self.categorize_file(file_name)

-        save_path = os.path.join(self.data_dir, category, file_name)
+        save_path = os.path.join(self._data_dir, category, file_name)
         os.makedirs(os.path.dirname(save_path), exist_ok=True)

-        hash_path = os.path.join(self.hash_dir, category, f"{file_name}.hash")
+        hash_path = os.path.join(self._hash_dir, category, f"{file_name}.hash")
         os.makedirs(os.path.dirname(hash_path), exist_ok=True)

         old_hash = None
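
The hunk cuts off right after old_hash = None. The surrounding names imply the standard compare-and-write idiom; a sketch under that assumption, not the PR's verbatim body:

import os

def write_if_changed(save_path: str, hash_path: str, content: bytes, new_hash: str) -> bool:
    # Sketch: skip the write when the stored hash matches the new one.
    old_hash = None
    if os.path.exists(hash_path):
        with open(hash_path, "r", encoding="utf-8") as f:
            old_hash = f.read().strip()
    if old_hash == new_hash:
        return False                      # unchanged; leave the file alone
    with open(save_path, "wb") as f:      # new or updated content
        f.write(content)
    with open(hash_path, "w", encoding="utf-8") as f:
        f.write(new_hash)
    return True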
@@ -345,14 +345,14 @@ async def close(self):
     async def close(self):
         """Close Selenium and the async httpx session."""
         # Close the WebDriver
-        if self.driver:
-            self.driver.quit()
-            self.driver = None
+        if self._driver:
+            self._driver.quit()
+            self._driver = None
             logging.info("WebDriver closed.")

         # Close the HTTPX session
         try:
-            await self.session.aclose()
+            await self._session.aclose()
         except Exception as e:
             logging.error(f"❌ Error closing session: {e}")
         finally:
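
Taken together, the renames make every attribute underscore-private, so external code is expected to drive the scraper through its public coroutines alone. A minimal lifecycle sketch under that assumption; the entry URL and the orchestration are illustrative, not from the PR:

import asyncio

async def main():
    scraper = NYCInfoHubScraper()  # defaults create data/, hashes/, logs/ beside the package
    try:
        links = await scraper.scrape_page_links("https://infohub.nyced.org/")  # hypothetical entry URL
        for link in links:
            url, content = await scraper.download_excel(link)
            # hashing and scraper.save_file(url, content, new_hash) would follow here
    finally:
        await scraper.close()  # quits Selenium and closes the httpx session

asyncio.run(main())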