Skip to content

Conversation

dliu04
Copy link

@dliu04 dliu04 commented Aug 2, 2025

Implements the check_escape.py script, which detects "Wrong escape" issues as specified in GitHub issue #149. The script identifies:

  • Incomplete LaTeX commands (\c, \p, \l)
  • Invalid escape characters (-)
  • Improper quote escaping
  • Unknown escape sequences

@koppor
Copy link
Member

koppor commented Aug 3, 2025

@koppor
Copy link
Member

koppor commented Aug 14, 2025

Review hint: Does this PR update existing fetched lists? If yes, discuss alternatives: rather than failing the PR, only report the findings in the workflow summary, and additionally report them to the upstream list provider.

Comment on lines +35 to +62
for file in fileNames:
if (file.endswith(".csv")):
# For each .csv file in the folder, open in read mode
with open(PATH_TO_JOURNALS + file, "r", encoding='utf-8', errors='ignore') as f:
for i, line in enumerate(f):
# Look for specific problematic patterns
problematic_patterns = [
(r'\\c(?![primeyrd])', 'incomplete LaTeX command - should be \\cyr or \\cprime'),
(r'\\p(?!olhk)', 'incomplete LaTeX command - should be \\polhk'),
(r'\\l(?!dots|asp)', 'incomplete LaTeX command'),
(r'\\"[^,"]', 'improper quote escaping'),
(r'\\(?![\\"/nrt$&-]|sp|rm|circledR|cprime|cyr|polhk|cdprime|ldots|lasp)[a-zA-Z]+', 'unknown escape sequence'),
]

for pattern, description in problematic_patterns:
matches = re.finditer(pattern, line)
for match in matches:
# Skip if we're inside a mathematical expression (between $ signs)
line_before_match = line[:match.start()]
line_after_match = line[match.end():]
dollar_count_before = line_before_match.count('$')
dollar_count_after = line_after_match.count('$')

# If we have an odd number of $ before and after, we're inside math - allow it
if (dollar_count_before % 2 == 1) and (dollar_count_after % 2 == 1):
continue

errFileNames.append(file)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for file in fileNames:
if (file.endswith(".csv")):
# For each .csv file in the folder, open in read mode
with open(PATH_TO_JOURNALS + file, "r", encoding='utf-8', errors='ignore') as f:
for i, line in enumerate(f):
# Look for specific problematic patterns
problematic_patterns = [
(r'\\c(?![primeyrd])', 'incomplete LaTeX command - should be \\cyr or \\cprime'),
(r'\\p(?!olhk)', 'incomplete LaTeX command - should be \\polhk'),
(r'\\l(?!dots|asp)', 'incomplete LaTeX command'),
(r'\\"[^,"]', 'improper quote escaping'),
(r'\\(?![\\"/nrt$&-]|sp|rm|circledR|cprime|cyr|polhk|cdprime|ldots|lasp)[a-zA-Z]+', 'unknown escape sequence'),
]
for pattern, description in problematic_patterns:
matches = re.finditer(pattern, line)
for match in matches:
# Skip if we're inside a mathematical expression (between $ signs)
line_before_match = line[:match.start()]
line_after_match = line[match.end():]
dollar_count_before = line_before_match.count('$')
dollar_count_after = line_after_match.count('$')
# If we have an odd number of $ before and after, we're inside math - allow it
if (dollar_count_before % 2 == 1) and (dollar_count_after % 2 == 1):
continue
errFileNames.append(file)
for fileName in fileNames:
if (fileName.endswith(".csv")):
# For each .csv file in the folder, open in read mode
with open(PATH_TO_JOURNALS + fileName, "r", encoding='utf-8', errors='ignore') as f:
for i, line in enumerate(f):
# Look for specific problematic patterns
problematic_patterns = [
(r'\\c(?![primeyrd])', 'incomplete LaTeX command - should be \\cyr or \\cprime'),
(r'\\p(?!olhk)', 'incomplete LaTeX command - should be \\polhk'),
(r'\\l(?!dots|asp)', 'incomplete LaTeX command'),
(r'\\"[^,"]', 'improper quote escaping'),
(r'\\(?![\\"/nrt$&-]|sp|rm|circledR|cprime|cyr|polhk|cdprime|ldots|lasp)[a-zA-Z]+', 'unknown escape sequence'),
]
for pattern, description in problematic_patterns:
matches = re.finditer(pattern, line)
for match in matches:
# Skip if we're inside a mathematical expression (between $ signs)
line_before_match = line[:match.start()]
line_after_match = line[match.end():]
dollar_count_before = line_before_match.count('$')
dollar_count_after = line_after_match.count('$')
# If we have an odd number of $ before and after, we're inside math - allow it
if (dollar_count_before % 2 == 1) and (dollar_count_after % 2 == 1):
continue
errFileNames.append(fileName)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants