@@ -8,6 +8,23 @@ import path from 'path';
import { fileURLToPath } from 'url';
import { fetchGithubInfo, extractGithubRepoInfo } from '../lib/githubEnrichment.js';

+// Command-line arguments
+const args = process.argv.slice(2);
+let BATCH_SIZE: number | null = null; // null means process all records
+
+// Parse the optional --batch_size flag
+for (let i = 0; i < args.length; i++) {
+  if (args[i] === '--batch_size' && i + 1 < args.length) {
+    const batchSize = parseInt(args[i + 1], 10);
+    if (!isNaN(batchSize) && batchSize > 0) {
+      BATCH_SIZE = batchSize;
+      i++; // Skip the next argument, since it is the flag's value
+    } else {
+      console.error(`Invalid batch size: ${args[i + 1]}. Will process all records.`);
+    }
+  }
+}
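+// Example invocation (the script's filename isn't shown in this diff, so the path is assumed):
+//   npx tsx scripts/update-github-info.ts --batch_size 50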
+
// Get the directory name in ESM
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
@@ -34,11 +51,23 @@ function ensureDirectoryExists(dirPath: string): void {
}

// Function to load the log file or create it if it doesn't exist
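+// ProcessedLog's definition isn't shown in this diff; from its usage below it is assumed
+// to include { lastProcessed: string; processedFiles: string[]; errors: Record<string, unknown> }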
-function loadProcessedLog(): ProcessedLog {
+function loadProcessedLog(allFilesCount: number): ProcessedLog {
  if (fs.existsSync(LOG_FILE)) {
    try {
      const logContent = fs.readFileSync(LOG_FILE, 'utf8');
-      return JSON.parse(logContent) as ProcessedLog;
+      const logData = JSON.parse(logContent) as ProcessedLog;
+
+      // Check if we've already processed all files and should start fresh
+      if (logData.processedFiles.length >= allFilesCount) {
+        console.log(`Log file shows all ${logData.processedFiles.length} files already processed. Starting fresh.`);
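+        // Note: starting fresh also discards the errors recorded during the previous run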
+        return {
+          lastProcessed: new Date().toISOString(),
+          processedFiles: [],
+          errors: {}
+        };
+      }
+
+      return logData;
    } catch (error) {
      console.warn(`Error reading log file, creating a new one: ${error}`);
    }
@@ -153,21 +182,22 @@ async function updateGithubInfoInFile(filePath: string): Promise<boolean> {
async function processAllFiles(): Promise<void> {
  console.log('Starting GitHub info update process...');
  console.log(`Looking for JSON files in: ${SPLIT_DIR}`);
+  console.log(BATCH_SIZE ? `Batch size set to: ${BATCH_SIZE}` : 'Processing all remaining records');
+
+  // Get all JSON files from split directory (only in the root, not in language subdirectories)
+  const allFiles = fs.readdirSync(SPLIT_DIR)
+    .filter(file => file.endsWith('.json') && fs.statSync(path.join(SPLIT_DIR, file)).isFile());
+
+  console.log(`Found ${allFiles.length} total JSON files in root directory`);

  // Load processed log
-  const processedLog = loadProcessedLog();
+  const processedLog = loadProcessedLog(allFiles.length);
  console.log(`Loaded processing log. Last run: ${processedLog.lastProcessed}`);
  console.log(`Previously processed ${processedLog.processedFiles.length} files`);

  // Setup handlers to save progress on interruption
  setupShutdownHandlers(processedLog);

-  // Get all JSON files from split directory (only in the root, not in language subdirectories)
-  const allFiles = fs.readdirSync(SPLIT_DIR)
-    .filter(file => file.endsWith('.json') && fs.statSync(path.join(SPLIT_DIR, file)).isFile());
-
-  console.log(`Found ${allFiles.length} total JSON files in root directory`);
-
  // Filter out already processed files
  const filesToProcess = allFiles.filter(file => {
    const hubId = getHubIdFromFilename(file);
@@ -181,11 +211,17 @@ async function processAllFiles(): Promise<void> {
    return;
  }

-  // Process each file
-  for (const [index, file] of filesToProcess.entries()) {
+  // Limit the number of files to process based on batch size, if provided
+  const filesToProcessInThisBatch = BATCH_SIZE ? filesToProcess.slice(0, BATCH_SIZE) : filesToProcess;
+  console.log(BATCH_SIZE
+    ? `Processing batch of ${filesToProcessInThisBatch.length} files (limited by batch size ${BATCH_SIZE})`
+    : `Processing all ${filesToProcessInThisBatch.length} remaining files`);
+
+  // Process each file in the batch
+  for (const [index, file] of filesToProcessInThisBatch.entries()) {
    try {
      const hubId = getHubIdFromFilename(file);
-      console.log(`Processing file ${index + 1}/${filesToProcess.length}: ${file} (hubId: ${hubId})`);
+      console.log(`Processing file ${index + 1}/${filesToProcessInThisBatch.length}${BATCH_SIZE ? ` (batch_size: ${BATCH_SIZE})` : ''}: ${file} (hubId: ${hubId})`);
      const filePath = path.join(SPLIT_DIR, file);

      // Update GitHub info in the main file
@@ -227,6 +263,18 @@ async function processAllFiles(): Promise<void> {
  } else {
    console.log('GitHub info update process completed successfully!');
  }
+
+  // Report on overall progress
+  console.log(`Processed ${filesToProcessInThisBatch.length} files${BATCH_SIZE ? ' in this batch' : ''}.`);
+  console.log(`Total progress: ${processedLog.processedFiles.length}/${allFiles.length} files processed.`);
+
+  if (processedLog.processedFiles.length < allFiles.length) {
+    console.log(BATCH_SIZE
+      ? 'Run the script again to process the next batch.'
+      : 'Some files may have been skipped due to errors. Check the log file for details.');
+  } else {
+    console.log('All files have been processed. The log file will be reset on the next run.');
+  }
}

// Execute the main function