Commit 601e72b

Removed mwparserfromhell as a dependency and incremented version to 0.1.2
1 parent 011e905 commit 601e72b

2 files changed: +26 −24 lines changed

mwcites/__init__.py (1 addition, 1 deletion)

@@ -1,3 +1,3 @@
 from .identifier import Identifier
 
-__version__ = "0.1.0"
+__version__ = "0.1.2"

mwcites/extractors/doi.py (25 additions, 23 deletions)
@@ -1,32 +1,34 @@
 import re
 from collections import defaultdict
 
-import mwparserfromhell as mwp
 from more_itertools import peekable
 
 from ..identifier import Identifier
 
-DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)')
-
-def extract_regex(text):
-    for match in DOI_RE.finditer(text):
-        id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
-        yield Identifier("doi", id)
 
 DOI_START_RE = re.compile(r'10\.[0-9]{4,}/')
 
 HTML_TAGS = ['ref', 'span', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-             'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote',
-             'pre']
+             'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote',
+             'pre']
 
 TAGS_RE = re.compile(r'<(/\s*)?(' + '|'.join(HTML_TAGS) + ')(\s[^>\n\r]+)?>', re.I)
 
+'''
+DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)')
 
+def extract_regex(text):
+    for match in DOI_RE.finditer(text):
+        id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
+        yield Identifier("doi", id)
+
+import mwparserfromhell as mwp
 def extract_mwp(text):
     no_tags = mwp.parse(text).strip_code()
     for match in DOI_RE.finditer(no_tags):
         id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
         yield Identifier("doi", id)
+'''
 
 LEXICON = [
     (DOI_START_RE.pattern, 'doi_start'),
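
For context on what remains after this change: a minimal standalone sketch of how the retained DOI_START_RE and TAGS_RE patterns behave. The patterns are copied from the diff above; the sample text and printed results are illustrative, not taken from the repository or its tests.

import re

# Patterns copied from the diff above; everything else in this sketch is illustrative.
DOI_START_RE = re.compile(r'10\.[0-9]{4,}/')
HTML_TAGS = ['ref', 'span', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
             'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote',
             'pre']
TAGS_RE = re.compile(r'<(/\s*)?(' + '|'.join(HTML_TAGS) + r')(\s[^>\n\r]+)?>', re.I)

sample = 'A claim.<ref>{{cite journal | doi = 10.1000/182 }}</ref>'

# TAGS_RE strips HTML-style tags without a full wikitext parse.
print(TAGS_RE.sub('', sample))               # A claim.{{cite journal | doi = 10.1000/182 }}

# DOI_START_RE only marks where a DOI begins; the token lexicon below reads the rest.
print(DOI_START_RE.search(sample).group(0))  # 10.1000/
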
@@ -53,21 +55,21 @@ def extract_mwp(text):
 def extract_island(text):
     tokens = tokenize_finditer(text, LEXICON)
     tokens = peekable(tokens)
-
+
     while tokens.peek(None) is not None:
-
+
         if tokens.peek()[0] == 'doi_start':
             yield ('doi', read_doi(tokens))
-
+
         next(tokens)
 
 
 def tokenize_finditer(text, lexicon=LEXICON):
     pattern = '|'.join("(?P<{0}>{1})".format(name, pattern)
                        for pattern, name in lexicon)
-
+
     group_regex = re.compile(pattern, re.I|re.U|re.M)
-
+
     for match in group_regex.finditer(text):
         yield match.lastgroup, match.group(0)
 
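
tokenize_finditer above compiles the whole lexicon into one alternation with a named group per token class and yields (name, matched_text) pairs; extract_island just peeks at that stream and calls read_doi whenever a doi_start token comes up. A toy sketch of the named-group technique, using a made-up three-entry lexicon rather than the module's real LEXICON:

import re

# Made-up lexicon in the same (pattern, name) shape as LEXICON; not the module's own entries.
TOY_LEXICON = [
    (r'10\.[0-9]{4,}/', 'doi_start'),
    (r'\s+',            'whitespace'),
    (r'.',              'character'),
]

def toy_tokenize(text, lexicon=TOY_LEXICON):
    # One combined regex, one named group per token class, as in tokenize_finditer.
    pattern = '|'.join("(?P<{0}>{1})".format(name, pattern)
                       for pattern, name in lexicon)
    group_regex = re.compile(pattern, re.I | re.U | re.M)
    for match in group_regex.finditer(text):
        yield match.lastgroup, match.group(0)

print(list(toy_tokenize('doi: 10.1000/182')))
# [('character', 'd'), ('character', 'o'), ('character', 'i'), ('character', ':'),
#  ('whitespace', ' '), ('doi_start', '10.1000/'), ('character', '1'),
#  ('character', '8'), ('character', '2')]
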

@@ -84,14 +86,14 @@ def tokenize_scanner(text, lexicon=LEXICON):
 
 def read_doi(tokens):
     assert tokens.peek()[0] == 'doi_start'
-
+
     depth = defaultdict(lambda: 0)
-
+
     doi_buffer = [next(tokens)[1]]
-
+
     while tokens.peek(None) is not None:
         name, match = tokens.peek()
-
+
         if name in ('url_end', 'break', 'whitespace', 'tag', 'pipe',
                     'comment_start', 'comment_end'):
             break
@@ -115,8 +117,8 @@ def read_doi(tokens):
             break
         else:
             doi_buffer.append(next(tokens)[1])
-
-
+
+
     # Do not return a doi with punctuation at the end
     return re.sub(r'[\.,!]+$', '', ''.join(doi_buffer))
 
@@ -125,16 +127,16 @@ def read_doi(tokens):
 def tokenize_search(text, start, lexicon=LEXICON):
     pattern = '|'.join("(?P<{0}>{1})".format(name, pattern)
                        for pattern, name in lexicon)
-
+
     group_regex = re.compile(pattern, re.I|re.U)
-
+
     match = group_regex.search(text, start)
     while match is not None:
         yield match.lastgroup, match.group(0)
         match = group_regex.search(text, match.span()[1])
 
 def extract_search(text, lexicon=LEXICON):
-
+
     last_end = 0
     for match in DOI_START_RE.finditer(text):
         if match.span()[0] > last_end:
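
With the mwparserfromhell path commented out, extraction runs purely on the regex island parser. A hedged usage sketch, assuming mwcites.extractors.doi from this repository is importable and that the full LEXICON tokenizes the made-up wikitext below as expected; the printed DOIs are the expected result, not output captured from a real run.

# Usage sketch; assumes the mwcites package from this repo is on the path.
from mwcites.extractors import doi

wikitext = """
A fact.<ref>{{cite journal | doi = 10.1000/182 }}</ref>
Another fact.<ref>http://dx.doi.org/10.1093/nar/gkl025</ref>
"""

for kind, ident in doi.extract_island(wikitext):
    print(kind, ident)
# Expected, roughly:
# doi 10.1000/182
# doi 10.1093/nar/gkl025

The point of the commit is that this path no longer requires mwparserfromhell to be installed.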
