diff --git a/requirements.txt b/requirements.txt
index 0ac3552..59a4daa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
argparse==1.2.1
PyYAML==3.10
beautifulsoup4==4.2.0
+six==1.10.0
diff --git a/tests.py b/tests.py
index 891b35e..d32d932 100644
--- a/tests.py
+++ b/tests.py
@@ -1,11 +1,12 @@
# encoding=utf-8
+import six
import twitter_text, sys, os, json, argparse, re
from twitter_text.unicode import force_unicode
narrow_build = True
try:
- unichr(0x20000)
+ six.unichr(0x20000)
narrow_build = False
except:
pass
@@ -177,4 +178,4 @@ def assert_equal(result, test):
sys.stdout.write(u'\033[0m-------\n\033[92m%d tests passed.\033[0m\n' % attempted)
sys.stdout.flush()
-sys.exit(os.EX_OK)
\ No newline at end of file
+sys.exit(os.EX_OK)
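For reference, the probe above works because six.unichr is unichr on Python 2 and chr on Python 3, and a narrow (UCS-2) Python 2 build raises ValueError for code points above 0xFFFF. A minimal standalone sketch of the same check:

    import six

    try:
        six.unichr(0x20000)   # a code point outside the BMP
        narrow_build = False  # wide Python 2 build, or any Python 3
    except ValueError:
        narrow_build = True   # narrow Python 2 build caps unichr() at 0xFFFF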
diff --git a/twitter_text/autolink.py b/twitter_text/autolink.py
index 17b32c5..9d31aed 100644
--- a/twitter_text/autolink.py
+++ b/twitter_text/autolink.py
@@ -1,5 +1,5 @@
# encoding=utf-8
-
+from __future__ import unicode_literals
import re, cgi
from twitter_text.regex import REGEXEN
@@ -113,7 +113,7 @@ def auto_link_entities(self, entities = [], options = {}):
return self.text
# NOTE deprecate these attributes not options keys in options hash, then use html_attrs
- options = dict(DEFAULT_OPTIONS.items() + options.items())
+ options = dict(list(DEFAULT_OPTIONS.items()) + list(options.items()))
options['html_attrs'] = self._extract_html_attrs_from_options(options)
if not options.get('suppress_no_follow', False):
options['html_attrs']['rel'] = "nofollow"
@@ -302,16 +302,16 @@ def _link_url_with_entity(self, entity, options = {}):
For those URLs, display_url is not a substring of expanded_url, so we don't do anything special to render the elided parts.
For a pic.twitter.com URL, the only elided part will be the "https://", so this is fine.
"""
- display_url = entity.get('display_url').decode('utf-8')
+ display_url = entity.get('display_url')
expanded_url = entity.get('expanded_url')
invisible_tag_attrs = options.get('invisible_tag_attrs', DEFAULT_INVISIBLE_TAG_ATTRS)
- display_url_sans_ellipses = re.sub(ur'…', u'', display_url)
+ display_url_sans_ellipses = re.sub(r'…', '', display_url)
if expanded_url.find(display_url_sans_ellipses) > -1:
before_display_url, after_display_url = expanded_url.split(display_url_sans_ellipses, 2)
- preceding_ellipsis = re.search(ur'\A…', display_url)
- following_ellipsis = re.search(ur'…\z', display_url)
+ preceding_ellipsis = re.search(r'\A…', display_url)
+ following_ellipsis = re.search(r'…\Z', display_url)
if preceding_ellipsis is not None:
preceding_ellipsis = preceding_ellipsis.group()
else:
@@ -344,7 +344,7 @@ def _link_url_with_entity(self, entity, options = {}):
# …
#
- return u"%s %s%s%s %s" % (preceding_ellipsis, invisible_tag_attrs, invisible_tag_attrs, self._html_escape(before_display_url), self._html_escape(display_url_sans_ellipses), invisible_tag_attrs, self._html_escape(after_display_url), invisible_tag_attrs, following_ellipsis)
+ return "%s %s%s%s %s" % (preceding_ellipsis, invisible_tag_attrs, invisible_tag_attrs, self._html_escape(before_display_url), self._html_escape(display_url_sans_ellipses), invisible_tag_attrs, self._html_escape(after_display_url), invisible_tag_attrs, following_ellipsis)
else:
return self._html_escape(display_url)
@@ -356,13 +356,13 @@ def _link_to_hashtag(self, entity, chars, options = {}):
if REGEXEN['rtl_chars'].search(hashtag):
hashtag_class += ' rtl'
- href = options.get('hashtag_url_transform', lambda ht: u'%s%s' % (options.get('hashtag_url_base'), ht))(hashtag)
+ href = options.get('hashtag_url_transform', lambda ht: '%s%s' % (options.get('hashtag_url_base'), ht))(hashtag)
html_attrs = {}
html_attrs.update(options.get('html_attrs', {}))
html_attrs = {
'class': hashtag_class,
- 'title': u'#%s' % hashtag,
+ 'title': '#%s' % hashtag,
}
link = self._link_to_text_with_symbol(entity, hashchar, hashtag, href, html_attrs, options)
@@ -372,11 +372,11 @@ def _link_to_cashtag(self, entity, chars, options = {}):
dollar = chars[entity['indices'][0]]
cashtag = entity['cashtag']
- href = options.get('cashtag_url_transform', lambda ct: u'%s%s' % (options.get('cashtag_url_base'), ct))(cashtag)
+ href = options.get('cashtag_url_transform', lambda ct: '%s%s' % (options.get('cashtag_url_base'), ct))(cashtag)
html_attrs = {
'class': options.get('cashtag_class'),
- 'title': u'$%s' % cashtag
+ 'title': '$%s' % cashtag
}
html_attrs.update(options.get('html_attrs', {}))
@@ -384,7 +384,7 @@ def _link_to_cashtag(self, entity, chars, options = {}):
return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:]
def _link_to_screen_name(self, entity, chars, options = {}):
- name = u'%s%s' % (entity['screen_name'], entity.get('list_slug') or '')
+ name = '%s%s' % (entity['screen_name'], entity.get('list_slug') or '')
chunk = options.get('link_text_transform', default_transform)(entity, name)
name = name.lower()
@@ -395,30 +395,30 @@ def _link_to_screen_name(self, entity, chars, options = {}):
del(html_attrs['title'])
if entity.get('list_slug') and not options.get('supress_lists'):
- href = options.get('list_url_transform', lambda sn: u'%s%s' % (options.get('list_url_base'), sn))(name)
+ href = options.get('list_url_transform', lambda sn: '%s%s' % (options.get('list_url_base'), sn))(name)
html_attrs['class'] = options.get('list_class')
else:
- href = options.get('username_url_transform', lambda sn: u'%s%s' % (options.get('username_url_base'), sn))(name)
+ href = options.get('username_url_transform', lambda sn: '%s%s' % (options.get('username_url_base'), sn))(name)
html_attrs['class'] = options.get('username_class')
link = self._link_to_text_with_symbol(entity, at, chunk, href, html_attrs, options)
return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:]
def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes = {}, options = {}):
- tagged_symbol = u'<%s>%s</%s>' % (options.get('symbol_tag'), symbol, options.get('symbol_tag')) if options.get('symbol_tag') else symbol
+ tagged_symbol = '<%s>%s</%s>' % (options.get('symbol_tag'), symbol, options.get('symbol_tag')) if options.get('symbol_tag') else symbol
text = self._html_escape(text)
- tagged_text = u'<%s>%s</%s>' % (options.get('text_with_symbol_tag'), text, options.get('text_with_symbol_tag')) if options.get('text_with_symbol_tag') else text
+ tagged_text = '<%s>%s</%s>' % (options.get('text_with_symbol_tag'), text, options.get('text_with_symbol_tag')) if options.get('text_with_symbol_tag') else text
if options.get('username_include_symbol') or not REGEXEN['at_signs'].match(symbol):
- return u'%s' % self._link_to_text(entity, tagged_symbol + tagged_text, href, attributes, options)
+ return '%s' % self._link_to_text(entity, tagged_symbol + tagged_text, href, attributes, options)
else:
- return u'%s%s' % (tagged_symbol, self._link_to_text(entity, tagged_text, href, attributes, options))
+ return '%s%s' % (tagged_symbol, self._link_to_text(entity, tagged_text, href, attributes, options))
def _link_to_text(self, entity, text, href, attributes = {}, options = {}):
attributes['href'] = href
- if options.get('link_attributes_transform'):
- attributes = options.get('link_attributes_transform')(entity, attributes)
+ if options.get('link_attribute_transform'):
+ attributes = options.get('link_attribute_transform')(entity, attributes)
text = options.get('link_text_transform', default_transform)(entity, text)
- return u'<a %s>%s</a>' % (self._tag_attrs(attributes), text)
+ return '<a %s>%s</a>' % (self._tag_attrs(attributes), text)
def _tag_attrs(self, attributes = {}):
attrs = []
@@ -428,7 +428,7 @@ def _tag_attrs(self, attributes = {}):
attrs.append(key)
continue
if type(value) == list:
- value = u' '.join(value)
- attrs.append(u'%s="%s"' % (self._html_escape(key), self._html_escape(value)))
+ value = ' '.join(value)
+ attrs.append('%s="%s"' % (self._html_escape(key), self._html_escape(value)))
- return u' '.join(attrs)
\ No newline at end of file
+ return ' '.join(attrs)
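Two of the hunks above deserve a note. dict.items() returns a list on Python 2 but a view on Python 3, and views do not support +, so the merge in auto_link_entities is rewritten with list() on both sides; later entries still win, as this small sketch (option names taken from the hunk) shows:

    DEFAULT_OPTIONS = {'suppress_no_follow': False}
    options = {'suppress_no_follow': True}
    merged = dict(list(DEFAULT_OPTIONS.items()) + list(options.items()))
    assert merged['suppress_no_follow'] is True  # caller options override defaults

Dropping .decode('utf-8') from display_url is likewise required: the entity values arrive as text already (e.g. from parsed JSON), and text has no .decode() on Python 3, so the old call would raise AttributeError.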
diff --git a/twitter_text/highlighter.py b/twitter_text/highlighter.py
index ec128ca..f352b25 100644
--- a/twitter_text/highlighter.py
+++ b/twitter_text/highlighter.py
@@ -1,7 +1,7 @@
# encoding=utf-8
-
+from __future__ import unicode_literals
import re
-from HTMLParser import HTMLParser
+from six.moves import html_parser
from twitter_text.regex import UNICODE_SPACES
from twitter_text.unicode import force_unicode
@@ -9,7 +9,7 @@
DEFAULT_HIGHLIGHT_TAG = 'em'
# from http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
-class MLStripper(HTMLParser):
+class MLStripper(html_parser.HTMLParser):
def __init__(self):
+ # Python 3's HTMLParser does real setup in __init__ (it sets convert_charrefs),
+ # so the base initializer must run on both versions before feed() is called.
+ html_parser.HTMLParser.__init__(self)
self.reset()
self.fed = []
@@ -34,14 +34,14 @@ def hit_highlight(self, hits = [], **kwargs):
if not hits and kwargs.get('query'):
stripped_text = strip_tags(self.text)
- for match in re.finditer(ur'%s' % kwargs.get('query'), stripped_text):
+ for match in re.finditer(r'%s' % kwargs.get('query'), stripped_text):
hits.append(match.span())
if hits and not type(hits) == list:
raise Exception('The syntax for the hit_highlight method has changed. You must pass in a list of lists containing the indices of the strings you want to match.')
tag_name = kwargs.get('tag', DEFAULT_HIGHLIGHT_TAG)
- tags = [u'<%s>' % tag_name, u'</%s>' % tag_name]
+ tags = ['<%s>' % tag_name, '</%s>' % tag_name]
text = self.text
chunks = re.split(r'[<>]', text)
@@ -58,7 +58,7 @@ def hit_highlight(self, hits = [], **kwargs):
if index % 2:
# we're inside a <tag>
continue
- chunk_start = len(u''.join(text_chunks[0:index / 2]))
+ chunk_start = len(''.join(text_chunks[0:index // 2]))  # // keeps the slice index an int on Python 3
chunk_end = chunk_start + len(chunk)
if hit_start >= chunk_start and hit_start < chunk_end:
chunk = chunk[:hit_start - chunk_start] + tags[0] + chunk[hit_start - chunk_start:]
@@ -76,8 +76,8 @@ def hit_highlight(self, hits = [], **kwargs):
for index, chunk in enumerate(chunks):
if index % 2:
# we're inside a <tag>
- result.append(u'<%s>' % chunk)
+ result.append('<%s>' % chunk)
else:
result.append(chunk)
- self.text = u''.join(result)
- return self.text
\ No newline at end of file
+ self.text = ''.join(result)
+ return self.text
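six.moves.html_parser resolves to the HTMLParser module on Python 2 and html.parser on Python 3, so MLStripper subclasses one name on both versions. Note that Python 3's HTMLParser does real setup in __init__ (it is where convert_charrefs is set), which is why the base initializer must run before feed(). A self-contained sketch of the stripper pattern:

    from six.moves import html_parser

    class Stripper(html_parser.HTMLParser):
        def __init__(self):
            html_parser.HTMLParser.__init__(self)  # required on Python 3
            self.fed = []

        def handle_data(self, data):
            self.fed.append(data)

    s = Stripper()
    s.feed('<em>hello</em> world')
    assert ''.join(s.fed) == 'hello world'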
diff --git a/twitter_text/regex.py b/twitter_text/regex.py
index c136f80..2a2d9dc 100644
--- a/twitter_text/regex.py
+++ b/twitter_text/regex.py
@@ -4,36 +4,40 @@
# list is frozen at load time to ensure immutability. These regular expressions are
# used throughout the Twitter classes. Special care has been taken to make
# sure these regular expressions work with Tweets in all languages.
-import re, string
+from __future__ import unicode_literals
+import re
+import six
+from six.moves import reduce
+
+REGEXEN = {} # :nodoc:
-REGEXEN = {} # :nodoc:
def regex_range(start, end = None):
if end:
- return u'%s-%s' % (unichr(start), unichr(end))
+ return '%s-%s' % (six.unichr(start), six.unichr(end))
else:
- return u'%s' % unichr(start)
+ return '%s' % six.unichr(start)
# Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
# to access both the list of characters and a pattern suitable for use with String#split
# Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
UNICODE_SPACES = []
for space in reduce(lambda x,y: x + y if type(y) == list else x + [y], [
- range(0x0009, 0x000D), # White_Space # Cc [5] <control-0009>..<control-000D>
+ list(range(0x0009, 0x000E)), # White_Space # Cc [5] <control-0009>..<control-000D>, inclusive: range() excludes its stop value
0x0020, # White_Space # Zs SPACE
0x0085, # White_Space # Cc
0x00A0, # White_Space # Zs NO-BREAK SPACE
0x1680, # White_Space # Zs OGHAM SPACE MARK
0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
- range(0x2000, 0x200A), # White_Space # Zs [11] EN QUAD..HAIR SPACE
+ list(range(0x2000, 0x200B)), # White_Space # Zs [11] EN QUAD..HAIR SPACE, inclusive: range() excludes its stop value
0x2028, # White_Space # Zl LINE SEPARATOR
0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
]):
- UNICODE_SPACES.append(unichr(space))
-REGEXEN['spaces'] = re.compile(ur''.join(UNICODE_SPACES))
+ UNICODE_SPACES.append(six.unichr(space))
+REGEXEN['spaces'] = re.compile(r''.join(UNICODE_SPACES))
# Characters not allowed in Tweets
INVALID_CHARACTERS = [
@@ -41,9 +45,9 @@ def regex_range(start, end = None):
0xFFFF, # Special
0x202A, 0x202B, 0x202C, 0x202D, 0x202E, # Directional change
]
-REGEXEN['invalid_control_characters'] = [unichr(x) for x in INVALID_CHARACTERS]
+REGEXEN['invalid_control_characters'] = [six.unichr(x) for x in INVALID_CHARACTERS]
-REGEXEN['list_name'] = re.compile(ur'^[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}$')
+REGEXEN['list_name'] = re.compile(r'^[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}$')
# Latin accented characters
# Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
@@ -67,14 +71,14 @@ def regex_range(start, end = None):
regex_range(0x0300, 0x036f),
regex_range(0x1e00, 0x1eff),
]
-REGEXEN['latin_accents'] = re.compile(ur''.join(LATIN_ACCENTS), re.IGNORECASE | re.UNICODE)
-LATIN_ACCENTS = u''.join(LATIN_ACCENTS)
+REGEXEN['latin_accents'] = re.compile(r''.join(LATIN_ACCENTS), re.IGNORECASE | re.UNICODE)
+LATIN_ACCENTS = ''.join(LATIN_ACCENTS)
RTL_CHARACTERS = ''.join([
- regex_range(0x0600,0x06FF),
- regex_range(0x0750,0x077F),
- regex_range(0x0590,0x05FF),
- regex_range(0xFE70,0xFEFF)
+ regex_range(0x0600, 0x06FF),
+ regex_range(0x0750, 0x077F),
+ regex_range(0x0590, 0x05FF),
+ regex_range(0xFE70, 0xFEFF)
])
NON_LATIN_HASHTAG_CHARS = ''.join([
@@ -147,69 +151,69 @@ def regex_range(start, end = None):
# this is a narrow python build so these extended Kanji characters won't work
pass
-PUNCTUATION_CHARS = ur'!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
-SPACE_CHARS = ur" \t\n\x0B\f\r"
-CTRL_CHARS = ur"\x00-\x1F\x7F"
+PUNCTUATION_CHARS = r'!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
+SPACE_CHARS = r" \t\n\x0B\f\r"
+CTRL_CHARS = r"\x00-\x1F\x7F"
# A hashtag must contain latin characters, numbers and underscores, but not all numbers.
-HASHTAG_ALPHA = ur'[a-z_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
-HASHTAG_ALPHANUMERIC = ur'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
-HASHTAG_BOUNDARY = ur'\A|\z|\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
+HASHTAG_ALPHA = r'[a-z_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
+HASHTAG_ALPHANUMERIC = r'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
+HASHTAG_BOUNDARY = r'\A|\Z|\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
-HASHTAG = re.compile(ur'(%s)(#|＃)(%s*%s%s*)' % (HASHTAG_BOUNDARY, HASHTAG_ALPHANUMERIC, HASHTAG_ALPHA, HASHTAG_ALPHANUMERIC), re.IGNORECASE)
+HASHTAG = re.compile(r'(%s)(#|＃)(%s*%s%s*)' % (HASHTAG_BOUNDARY, HASHTAG_ALPHANUMERIC, HASHTAG_ALPHA, HASHTAG_ALPHANUMERIC), re.IGNORECASE)
REGEXEN['valid_hashtag'] = HASHTAG
-REGEXEN['end_hashtag_match'] = re.compile(ur'\A(?:[#＃]|:\/\/)', re.IGNORECASE | re.UNICODE)
-REGEXEN['numeric_only'] = re.compile(ur'^[\d]+$')
+REGEXEN['end_hashtag_match'] = re.compile(r'\A(?:[#＃]|://)', re.IGNORECASE | re.UNICODE)
+REGEXEN['numeric_only'] = re.compile(r'^[\d]+$')
-REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#\$%&*@＠]|^|RT:?)')
-REGEXEN['at_signs'] = re.compile(ur'[@＠]')
+REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#$%&*@＠]|^|RT:?)')
+REGEXEN['at_signs'] = re.compile(r'[@＠]')
REGEXEN['valid_mention_or_list'] = re.compile(
- ur'(%s)' % REGEXEN['valid_mention_preceding_chars'].pattern.decode('utf-8') + # preceding character
- ur'(%s)' % REGEXEN['at_signs'].pattern + # at mark
- ur'([a-zA-Z0-9_]{1,20})' + # screen name
- ur'(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?' # list (optional)
+ r'(%s)' % REGEXEN['valid_mention_preceding_chars'].pattern + # preceding character
+ r'(%s)' % REGEXEN['at_signs'].pattern + # at mark
+ r'([a-zA-Z0-9_]{1,20})' + # screen name
+ r'(/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?' # list (optional)
)
-REGEXEN['valid_reply'] = re.compile(ur'^(?:[%s])*%s([a-zA-Z0-9_]{1,20})' % (REGEXEN['spaces'].pattern, REGEXEN['at_signs'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_reply'] = re.compile(r'^(?:[%s])*%s([a-zA-Z0-9_]{1,20})' % (REGEXEN['spaces'].pattern, REGEXEN['at_signs'].pattern), re.IGNORECASE | re.UNICODE)
# Used in Extractor for final filtering
-REGEXEN['end_mention_match'] = re.compile(ur'\A(?:%s|[%s]|:\/\/)' % (REGEXEN['at_signs'].pattern, REGEXEN['latin_accents'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['end_mention_match'] = re.compile(r'\A(?:%s|[%s]|://)' % (REGEXEN['at_signs'].pattern, REGEXEN['latin_accents'].pattern), re.IGNORECASE | re.UNICODE)
# URL related hash regex collection
-REGEXEN['valid_url_preceding_chars'] = re.compile(ur'(?:[^A-Z0-9@＠$#＃%s]|^)' % ur''.join(REGEXEN['invalid_control_characters']), re.IGNORECASE | re.UNICODE)
-REGEXEN['invalid_url_without_protocol_preceding_chars'] = re.compile(ur'[-_.\/]$')
-DOMAIN_VALID_CHARS = ur'[^%s%s%s%s%s]' % (PUNCTUATION_CHARS, SPACE_CHARS, CTRL_CHARS, ur''.join(REGEXEN['invalid_control_characters']), ur''.join(UNICODE_SPACES))
-REGEXEN['valid_subdomain'] = re.compile(ur'(?:(?:%s(?:[_-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_domain_name'] = re.compile(ur'(?:(?:%s(?:[-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_gTLD'] = re.compile(ur'(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_ccTLD'] = re.compile(ur'(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|新加坡|台湾|台灣|香港|한국)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_punycode'] = re.compile(ur'(?:xn--[0-9a-z]+)', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_preceding_chars'] = re.compile(r'(?:[^A-Z0-9@＠$#＃%s]|^)' % r''.join(REGEXEN['invalid_control_characters']), re.IGNORECASE | re.UNICODE)
+REGEXEN['invalid_url_without_protocol_preceding_chars'] = re.compile(r'[-_./]$')
+DOMAIN_VALID_CHARS = r'[^%s%s%s%s%s]' % (PUNCTUATION_CHARS, SPACE_CHARS, CTRL_CHARS, r''.join(REGEXEN['invalid_control_characters']), r''.join(UNICODE_SPACES))
+REGEXEN['valid_subdomain'] = re.compile(r'(?:(?:%s(?:[_-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_domain_name'] = re.compile(r'(?:(?:%s(?:[-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_gTLD'] = re.compile(r'(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_ccTLD'] = re.compile(r'(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|新加坡|台湾|台灣|香港|한국)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_punycode'] = re.compile(r'(?:xn--[0-9a-z]+)', re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_domain'] = re.compile(ur'(?:%s*%s(?:%s|%s|%s))' % (REGEXEN['valid_subdomain'].pattern, REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_domain'] = re.compile(r'(?:%s*%s(?:%s|%s|%s))' % (REGEXEN['valid_subdomain'].pattern, REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
# This is used in Extractor
-REGEXEN['valid_ascii_domain'] = re.compile(ur'(?:(?:[A-Za-z0-9\-_]|[%s])+\.)+(?:%s|%s|%s)' % (REGEXEN['latin_accents'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_ascii_domain'] = re.compile(r'(?:(?:[A-Za-z0-9\-_]|[%s])+\.)+(?:%s|%s|%s)' % (REGEXEN['latin_accents'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
# This is used in Extractor for stricter t.co URL extraction
-REGEXEN['valid_tco_url'] = re.compile(ur'^https?:\/\/t\.co\/[a-z0-9]+', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_tco_url'] = re.compile(r'^https?://t\.co/[a-z0-9]+', re.IGNORECASE | re.UNICODE)
# This is used in Extractor to filter out unwanted URLs.
-REGEXEN['invalid_short_domain'] = re.compile(ur'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_ccTLD'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['invalid_short_domain'] = re.compile(r'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_ccTLD'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_port_number'] = re.compile(ur'[0-9]+')
+REGEXEN['valid_port_number'] = re.compile(r'[0-9]+')
-REGEXEN['valid_general_url_path_chars'] = re.compile(ur"[a-z0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % LATIN_ACCENTS, re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_general_url_path_chars'] = re.compile(r"[a-z0-9!*';:=+,.$/%%#\[\]\-_~&|@%s]" % LATIN_ACCENTS, re.IGNORECASE | re.UNICODE)
# Allow URL paths to contain balanced parens
# 1. Used in Wikipedia URLs like /Primer_(film)
# 2. Used in IIS sessions like /S(dfd346)/
-REGEXEN['valid_url_balanced_parens'] = re.compile(ur'\(%s+\)' % REGEXEN['valid_general_url_path_chars'].pattern, re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_balanced_parens'] = re.compile(r'\(%s+\)' % REGEXEN['valid_general_url_path_chars'].pattern, re.IGNORECASE | re.UNICODE)
# Valid end-of-path characters (so /foo. does not gobble the period).
# 1. Allow = for empty URL parameters and other URL-join artifacts
-REGEXEN['valid_url_path_ending_chars'] = re.compile(ur'[a-z0-9=_#\/\+\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_url_path'] = re.compile(ur'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+\/))' % (REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_balanced_parens'].pattern, REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_path_ending_chars'].pattern, REGEXEN['valid_general_url_path_chars'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_path_ending_chars'] = re.compile(r'[a-z0-9=_#/+\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_path'] = re.compile(r'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+/))' % (REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_balanced_parens'].pattern, REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_path_ending_chars'].pattern, REGEXEN['valid_general_url_path_chars'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_url_query_chars'] = re.compile(ur"[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]", re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_url_query_ending_chars'] = re.compile(ur'[a-z0-9_&=#\/]', re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_url'] = re.compile(ur'((%s)((https?:\/\/)?(%s)(?::(%s))?(/%s*)?(\?%s*%s)?))' % (
+REGEXEN['valid_url_query_chars'] = re.compile(r"[a-z0-9!?*'();:&=+$/%#\[\]\-_.,~|@]", re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_query_ending_chars'] = re.compile(r'[a-z0-9_&=#/]', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url'] = re.compile(r'((%s)((https?://)?(%s)(?::(%s))?(/%s*)?(\?%s*%s)?))' % (
REGEXEN['valid_url_preceding_chars'].pattern,
REGEXEN['valid_domain'].pattern,
REGEXEN['valid_port_number'].pattern,
@@ -227,54 +231,54 @@ def regex_range(start, end = None):
# $7 URL Path and anchor
# $8 Query String
-REGEXEN['cashtag'] = re.compile(ur'[a-z]{1,6}(?:[._][a-z]{1,2})?', re.IGNORECASE)
-REGEXEN['valid_cashtag'] = re.compile(ur'(^|[%s])(\$|＄|﹩)(%s)(?=$|\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE)
+REGEXEN['cashtag'] = re.compile(r'[a-z]{1,6}(?:[._][a-z]{1,2})?', re.IGNORECASE)
+REGEXEN['valid_cashtag'] = re.compile(r'(^|[%s])(\$|＄|﹩)(%s)(?=$|\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE)
# These URL validation pattern strings are based on the ABNF from RFC 3986
-REGEXEN['validate_url_unreserved'] = re.compile(ur'[a-z0-9\-._~]', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_pct_encoded'] = re.compile(ur'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_sub_delims'] = re.compile(ur"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_pchar'] = re.compile(ur'(?:%s|%s|%s|[:\|@])' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unreserved'] = re.compile(r'[a-z0-9\-._~]', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_pct_encoded'] = re.compile(r'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_sub_delims'] = re.compile(r"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_pchar'] = re.compile(r'(?:%s|%s|%s|[:|@])' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_scheme'] = re.compile(ur'(?:[a-z][a-z0-9+\-.]*)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_userinfo'] = re.compile(ur'(?:%s|%s|%s|:)*' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_scheme'] = re.compile(r'(?:[a-z][a-z0-9+\-.]*)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_userinfo'] = re.compile(r'(?:%s|%s|%s|:)*' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_dec_octet'] = re.compile(ur'(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_ipv4'] = re.compile(ur'(?:%s(?:\.%s){3})' % (REGEXEN['validate_url_dec_octet'].pattern, REGEXEN['validate_url_dec_octet'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_dec_octet'] = re.compile(r'(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_ipv4'] = re.compile(r'(?:%s(?:\.%s){3})' % (REGEXEN['validate_url_dec_octet'].pattern, REGEXEN['validate_url_dec_octet'].pattern), re.IGNORECASE | re.UNICODE)
# Punting on real IPv6 validation for now
-REGEXEN['validate_url_ipv6'] = re.compile(ur'(?:\[[a-f0-9:\.]+\])', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_ipv6'] = re.compile(r'(?:\[[a-f0-9:.]+\])', re.IGNORECASE | re.UNICODE)
# Also punting on IPvFuture for now
-REGEXEN['validate_url_ip'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ipv4'].pattern, REGEXEN['validate_url_ipv6'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_ip'] = re.compile(r'(?:%s|%s)' % (REGEXEN['validate_url_ipv4'].pattern, REGEXEN['validate_url_ipv6'].pattern), re.IGNORECASE | re.UNICODE)
# This is more strict than the rfc specifies
-REGEXEN['validate_url_subdomain_segment'] = re.compile(ur'(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_domain_segment'] = re.compile(ur'(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_domain_tld'] = re.compile(ur'(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_domain'] = re.compile(ur'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_subdomain_segment'].pattern, REGEXEN['validate_url_domain_segment'].pattern, REGEXEN['validate_url_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_subdomain_segment'] = re.compile(r'(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_domain_segment'] = re.compile(r'(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_domain_tld'] = re.compile(r'(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_domain'] = re.compile(r'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_subdomain_segment'].pattern, REGEXEN['validate_url_domain_segment'].pattern, REGEXEN['validate_url_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_host'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_domain'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_host'] = re.compile(r'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_domain'].pattern), re.IGNORECASE | re.UNICODE)
# Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
-REGEXEN['validate_url_unicode_subdomain_segment'] = re.compile(ur'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_unicode_domain_segment'] = re.compile(ur'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_unicode_domain_tld'] = re.compile(ur'(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_unicode_domain'] = re.compile(ur'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_unicode_subdomain_segment'].pattern, REGEXEN['validate_url_unicode_domain_segment'].pattern, REGEXEN['validate_url_unicode_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_subdomain_segment'] = re.compile(r'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_domain_segment'] = re.compile(r'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_domain_tld'] = re.compile(r'(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_domain'] = re.compile(r'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_unicode_subdomain_segment'].pattern, REGEXEN['validate_url_unicode_domain_segment'].pattern, REGEXEN['validate_url_unicode_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_unicode_host'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_unicode_domain'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_host'] = re.compile(r'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_unicode_domain'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_port'] = re.compile(ur'[0-9]{1,5}')
+REGEXEN['validate_url_port'] = re.compile(r'[0-9]{1,5}')
-REGEXEN['validate_url_unicode_authority'] = re.compile(ur'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_unicode_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_authority'] = re.compile(r'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_unicode_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_authority'] = re.compile(ur'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_authority'] = re.compile(r'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_path'] = re.compile(ur'(/%s*)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_query'] = re.compile(ur'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_fragment'] = re.compile(ur'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_path'] = re.compile(r'(/%s*)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_query'] = re.compile(r'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_fragment'] = re.compile(r'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
# Modified version of RFC 3986 Appendix B
-REGEXEN['validate_url_unencoded'] = re.compile(ur'\A(?:([^:/?#]+)://)?([^/?#]*)([^?#]*)(?:\?([^#]*))?(?:\#(.*))?\Z', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unencoded'] = re.compile(r'\A(?:([^:/?#]+)://)?([^/?#]*)([^?#]*)(?:\?([^#]*))?(?:#(.*))?\Z', re.IGNORECASE | re.UNICODE)
-REGEXEN['rtl_chars'] = re.compile(ur'[%s]' % RTL_CHARACTERS, re.IGNORECASE | re.UNICODE)
+REGEXEN['rtl_chars'] = re.compile(r'[%s]' % RTL_CHARACTERS, re.IGNORECASE | re.UNICODE)
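The blanket ur'' to r'' change in this file is forced by grammar, not style: ur'...' literals are a SyntaxError on Python 3, and under from __future__ import unicode_literals a plain r'...' literal is already unicode on Python 2, so nothing changes there. The same goes for unichr versus six.unichr. A quick check of the regex_range helper under those rules (the 0x00C0-0x00D6 block is consistent with the "Excludes 0xd7" comment above):

    from __future__ import unicode_literals
    import six

    def regex_range(start, end=None):
        # Builds a character-class fragment such as 'À-Ö'.
        if end:
            return '%s-%s' % (six.unichr(start), six.unichr(end))
        return '%s' % six.unichr(start)

    assert regex_range(0x00C0, 0x00D6) == '\u00c0-\u00d6'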
diff --git a/twitter_text/unicode.py b/twitter_text/unicode.py
index 4e17267..e62eda5 100644
--- a/twitter_text/unicode.py
+++ b/twitter_text/unicode.py
@@ -1,6 +1,9 @@
-import types, datetime
+import datetime
from decimal import Decimal
+import six
+
+
# borrowed from django.utils.encoding
class TwitterTextUnicodeDecodeError(UnicodeDecodeError):
def __init__(self, obj, *args):
@@ -12,52 +15,49 @@ def __str__(self):
return '%s. You passed in %r (%s)' % (original, self.obj,
type(self.obj))
+
def is_protected_type(obj):
"""Determine if the object instance is of a protected type.
Objects of protected types are preserved as-is when passed to
force_unicode(strings_only=True).
"""
- return isinstance(obj, (
- types.NoneType,
- int, long,
+ return isinstance(obj, six.integer_types + (
+ type(None),
datetime.datetime, datetime.date, datetime.time,
float, Decimal)
)
+
def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
"""
- Similar to smart_unicode, except that lazy instances are resolved to
+ Similar to smart_text, except that lazy instances are resolved to
strings, rather than kept as lazy objects.
If strings_only is True, don't convert (some) non-string-like objects.
"""
+ # Handle the common case first for performance reasons.
+ if issubclass(type(s), six.text_type):
+ return s
if strings_only and is_protected_type(s):
return s
try:
- if not isinstance(s, basestring,):
- if hasattr(s, '__unicode__'):
- s = unicode(s)
+ if not issubclass(type(s), six.string_types):
+ if six.PY3:
+ if isinstance(s, bytes):
+ s = six.text_type(s, encoding, errors)
+ else:
+ s = six.text_type(s)
+ elif hasattr(s, '__unicode__'):
+ s = six.text_type(s)
else:
- try:
- s = unicode(str(s), encoding, errors)
- except UnicodeEncodeError:
- if not isinstance(s, Exception):
- raise
- # If we get to here, the caller has passed in an Exception
- # subclass populated with non-ASCII data without special
- # handling to display as a string. We need to handle this
- # without raising a further exception. We do an
- # approximation to what the Exception's standard str()
- # output should be.
- s = ' '.join([force_unicode(arg, encoding, strings_only,
- errors) for arg in s])
- elif not isinstance(s, unicode):
- # Note: We use .decode() here, instead of unicode(s, encoding,
- # errors), so that if s is a SafeString, it ends up being a
- # SafeUnicode at the end.
+ s = six.text_type(bytes(s), encoding, errors)
+ else:
+ # Note: We use .decode() here, instead of six.text_type(s, encoding,
+ # errors), so that if s is a SafeBytes, it ends up being a
+ # SafeText at the end.
s = s.decode(encoding, errors)
- except UnicodeDecodeError, e:
+ except UnicodeDecodeError as e:
if not isinstance(s, Exception):
raise TwitterTextUnicodeDecodeError(s, *e.args)
else:
@@ -66,6 +66,6 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
# working unicode method. Try to handle this without raising a
# further exception by individually forcing the exception args
# to unicode.
- s = ' '.join([force_unicode(arg, encoding, strings_only,
- errors) for arg in s])
+ s = ' '.join(force_unicode(arg, encoding, strings_only, errors)
+ for arg in s)
return s
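After this rewrite force_unicode has three clear paths: text is returned as-is, bytes are decoded with the given encoding, and protected types survive strings_only=True. A few expectations the new branches satisfy on both interpreters (a sketch against the patched module):

    from twitter_text.unicode import force_unicode

    assert force_unicode(u'café') == u'café'            # text passes through untouched
    assert force_unicode(b'caf\xc3\xa9') == u'caf\xe9'  # bytes are decoded as UTF-8
    assert force_unicode(42, strings_only=True) == 42   # protected types are preserved
    assert force_unicode(42) == u'42'                   # everything else is stringified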
diff --git a/twitter_text/validation.py b/twitter_text/validation.py
index 6dea5f9..fb442f3 100644
--- a/twitter_text/validation.py
+++ b/twitter_text/validation.py
@@ -1,5 +1,5 @@
# encoding=utf-8
-
+from __future__ import unicode_literals
import re
from twitter_text.unicode import force_unicode
@@ -75,7 +75,7 @@ def tweet_invalid(self):
if self.tweet_length() > MAX_LENGTH:
valid, validation_error = False, 'Too long'
- if re.search(ur''.join(REGEXEN['invalid_control_characters']), self.text):
+ if re.search(r''.join(REGEXEN['invalid_control_characters']), self.text):
valid, validation_error = False, 'Invalid characters'
if self.parent and hasattr(self.parent, 'tweet_is_valid'):
@@ -97,7 +97,7 @@ def valid_username(self):
return len(extracted) == 1 and extracted[0] == self.text[1:]
def valid_list(self):
- match = re.compile(ur'^%s$' % REGEXEN['valid_mention_or_list'].pattern).search(self.text)
+ match = re.compile(r'^%s$' % REGEXEN['valid_mention_or_list'].pattern).search(self.text)
return bool(match is not None and match.groups()[0] == "" and match.groups()[3])
def valid_hashtag(self):
@@ -124,7 +124,7 @@ def valid_url(self, unicode_domains = True, require_protocol = True):
not require_protocol
or (
self._valid_match(scheme, REGEXEN['validate_url_scheme'])
- and re.compile(ur'^https?$', re.IGNORECASE).match(scheme)
+ and re.compile(r'^https?$', re.IGNORECASE).match(scheme)
)
)
and (