import re
from collections import defaultdict

- import mwparserfromhell as mwp
from more_itertools import peekable

from ..identifier import Identifier

- DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)')
-
- def extract_regex(text):
-     for match in DOI_RE.finditer(text):
-         id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
-         yield Identifier("doi", id)

DOI_START_RE = re.compile(r'10\.[0-9]{4,}/')

HTML_TAGS = ['ref', 'span', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-              'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote',
-              'pre']
+              'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote',
+              'pre']

TAGS_RE = re.compile(r'<(/\s*)?(' + '|'.join(HTML_TAGS) + ')(\s[^>\n\r]+)?>', re.I)

+ '''
+ DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)')

+ def extract_regex(text):
+     for match in DOI_RE.finditer(text):
+         id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
+         yield Identifier("doi", id)
+
+ import mwparserfromhell as mwp
def extract_mwp(text):
    no_tags = mwp.parse(text).strip_code()
    for match in DOI_RE.finditer(no_tags):
        id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
        yield Identifier("doi", id)
+ '''

LEXICON = [
    (DOI_START_RE.pattern, 'doi_start'),
@@ -53,21 +55,21 @@ def extract_mwp(text):
def extract_island(text):
    tokens = tokenize_finditer(text, LEXICON)
    tokens = peekable(tokens)
-
+
    while tokens.peek(None) is not None:
-
+
        if tokens.peek()[0] == 'doi_start':
            yield ('doi', read_doi(tokens))
-
+
        next(tokens)


def tokenize_finditer(text, lexicon=LEXICON):
    pattern = '|'.join("(?P<{0}>{1})".format(name, pattern)
                       for pattern, name in lexicon)
-
+
    group_regex = re.compile(pattern, re.I | re.U | re.M)
-
+
    for match in group_regex.finditer(text):
        yield match.lastgroup, match.group(0)

@@ -84,14 +86,14 @@ def tokenize_scanner(text, lexicon=LEXICON):

def read_doi(tokens):
    assert tokens.peek()[0] == 'doi_start'
-
+
    depth = defaultdict(lambda: 0)
-
+
    doi_buffer = [next(tokens)[1]]
-
+
    while tokens.peek(None) is not None:
        name, match = tokens.peek()
-
+
        if name in ('url_end', 'break', 'whitespace', 'tag', 'pipe',
                    'comment_start', 'comment_end'):
            break
@@ -115,8 +117,8 @@ def read_doi(tokens):
                break
        else:
            doi_buffer.append(next(tokens)[1])
-
-
+
+
    # Do not return a doi with punctuation at the end
    return re.sub(r'[\.,!]+$', '', ''.join(doi_buffer))

@@ -125,16 +127,16 @@ def read_doi(tokens):
def tokenize_search(text, start, lexicon=LEXICON):
    pattern = '|'.join("(?P<{0}>{1})".format(name, pattern)
                       for pattern, name in lexicon)
-
+
    group_regex = re.compile(pattern, re.I | re.U)
-
+
    match = group_regex.search(text, start)
    while match is not None:
        yield match.lastgroup, match.group(0)
        match = group_regex.search(text, match.span()[1])

def extract_search(text, lexicon=LEXICON):
-
+
    last_end = 0
    for match in DOI_START_RE.finditer(text):
        if match.span()[0] > last_end:
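
For orientation, a minimal usage sketch of the island parser introduced above. The import path, the sample wikitext, and the expected output are assumptions for illustration, not part of this commit.

# Illustrative sketch only -- the module path below is an assumption inferred
# from the relative import `from ..identifier import Identifier` in this file.
from mwcites.extractors import doi

text = ("A sourced claim {{cite journal | doi=10.1000/182 | title=Example}} "
        "followed by ordinary prose.")

# extract_island() tokenizes the text with LEXICON and, whenever it sees a
# `doi_start` token, reads the remainder of the DOI island via read_doi().
for kind, value in doi.extract_island(text):
    print(kind, value)   # expected: doi 10.1000/182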