import re
from collections import defaultdict

- import mwparserfromhell as mwp
from more_itertools import peekable

from ..identifier import Identifier

- DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)')
-
- def extract_regex(text):
-     for match in DOI_RE.finditer(text):
-         id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
-         yield Identifier("doi", id)

DOI_START_RE = re.compile(r'10\.[0-9]{4,}/')

HTML_TAGS = ['ref', 'span', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-              'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote',
-              'pre']
+              'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote',
+              'pre']

TAGS_RE = re.compile(r'<(/\s*)?(' + '|'.join(HTML_TAGS) + ')(\s[^>\n\r]+)?>', re.I)

+ '''
+ DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)')

+ def extract_regex(text):
+     for match in DOI_RE.finditer(text):
+         id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
+         yield Identifier("doi", id)
+
+ import mwparserfromhell as mwp
def extract_mwp(text):
    no_tags = mwp.parse(text).strip_code()
    for match in DOI_RE.finditer(no_tags):
        id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
        yield Identifier("doi", id)
+ '''

LEXICON = [
    (DOI_START_RE.pattern, 'doi_start'),
@@ -53,21 +55,21 @@ def extract_mwp(text):
def extract_island(text):
    tokens = tokenize_finditer(text, LEXICON)
    tokens = peekable(tokens)
-
+
    while tokens.peek(None) is not None:
-
+
        if tokens.peek()[0] == 'doi_start':
            yield ('doi', read_doi(tokens))
-
+
        next(tokens)


def tokenize_finditer(text, lexicon=LEXICON):
    pattern = '|'.join("(?P<{0}>{1})".format(name, pattern)
                       for pattern, name in lexicon)
-
+
    group_regex = re.compile(pattern, re.I | re.U | re.M)
-
+
    for match in group_regex.finditer(text):
        yield match.lastgroup, match.group(0)

@@ -84,14 +86,14 @@ def tokenize_scanner(text, lexicon=LEXICON):

def read_doi(tokens):
    assert tokens.peek()[0] == 'doi_start'
-
+
    depth = defaultdict(lambda: 0)
-
+
    doi_buffer = [next(tokens)[1]]
-
+
    while tokens.peek(None) is not None:
        name, match = tokens.peek()
-
+
        if name in ('url_end', 'break', 'whitespace', 'tag', 'pipe',
                    'comment_start', 'comment_end'):
            break
@@ -115,8 +117,8 @@ def read_doi(tokens):
                break
        else:
            doi_buffer.append(next(tokens)[1])
-
-
+
+
    # Do not return a doi with punctuation at the end
    return re.sub(r'[\.,!]+$', '', ''.join(doi_buffer))

@@ -125,16 +127,16 @@ def read_doi(tokens):
def tokenize_search(text, start, lexicon=LEXICON):
    pattern = '|'.join("(?P<{0}>{1})".format(name, pattern)
                       for pattern, name in lexicon)
-
+
    group_regex = re.compile(pattern, re.I | re.U)
-
+
    match = group_regex.search(text, start)
    while match is not None:
        yield match.lastgroup, match.group(0)
        match = group_regex.search(text, match.span()[1])

def extract_search(text, lexicon=LEXICON):
-
+
    last_end = 0
    for match in DOI_START_RE.finditer(text):
        if match.span()[0] > last_end:
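
For orientation, a minimal usage sketch of the island parser introduced above. The import path, the sample wikitext, and the expected output are assumptions for illustration, not part of this commit.

# Illustrative sketch only -- the module path below is an assumption inferred
# from the relative import `from ..identifier import Identifier` in this file.
from mwcites.extractors import doi

text = ("A sourced claim {{cite journal | doi=10.1000/182 | title=Example}} "
        "followed by ordinary prose.")

# extract_island() tokenizes the text with LEXICON and, whenever it sees a
# `doi_start` token, reads the remainder of the DOI island via read_doi().
for kind, value in doi.extract_island(text):
    print(kind, value)   # expected: doi 10.1000/182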