diff --git a/requirements.txt b/requirements.txt
index 0ac3552..59a4daa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 argparse==1.2.1
 PyYAML==3.10
 beautifulsoup4==4.2.0
+six==1.10.0
diff --git a/tests.py b/tests.py
index 891b35e..d32d932 100644
--- a/tests.py
+++ b/tests.py
@@ -1,11 +1,12 @@
 # encoding=utf-8
+import six
 import twitter_text, sys, os, json, argparse, re
 from twitter_text.unicode import force_unicode
 
 narrow_build = True
 try:
-    unichr(0x20000)
+    six.unichr(0x20000)
     narrow_build = False
 except:
     pass
@@ -177,4 +178,4 @@ def assert_equal(result, test):
 sys.stdout.write(u'\033[0m-------\n\033[92m%d tests passed.\033[0m\n' % attempted)
 sys.stdout.flush()
 
-sys.exit(os.EX_OK)
\ No newline at end of file
+sys.exit(os.EX_OK)
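The six.unichr probe above is what keeps the narrow-build check portable: on a narrow Python 2 build, unichr()/six.unichr() raises ValueError for code points above 0xFFFF, while on wide builds and on Python 3 (where the unichr builtin is gone) it succeeds. A minimal sketch of the same probe, outside the patch:

    import six

    try:
        six.unichr(0x20000)  # CJK Extension B, outside the Basic Multilingual Plane
        narrow_build = False
    except ValueError:
        narrow_build = True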
""" - display_url = entity.get('display_url').decode('utf-8') + display_url = entity.get('display_url') expanded_url = entity.get('expanded_url') invisible_tag_attrs = options.get('invisible_tag_attrs', DEFAULT_INVISIBLE_TAG_ATTRS) - display_url_sans_ellipses = re.sub(ur'…', u'', display_url) + display_url_sans_ellipses = re.sub(r'…', '', display_url) if expanded_url.find(display_url_sans_ellipses) > -1: before_display_url, after_display_url = expanded_url.split(display_url_sans_ellipses, 2) - preceding_ellipsis = re.search(ur'\A…', display_url) - following_ellipsis = re.search(ur'…\z', display_url) + preceding_ellipsis = re.search(r'\A…', display_url) + following_ellipsis = re.search(r'…\Z', display_url) if preceding_ellipsis is not None: preceding_ellipsis = preceding_ellipsis.group() else: @@ -344,7 +344,7 @@ def _link_url_with_entity(self, entity, options = {}): # … # - return u"%s %s%s%s %s" % (preceding_ellipsis, invisible_tag_attrs, invisible_tag_attrs, self._html_escape(before_display_url), self._html_escape(display_url_sans_ellipses), invisible_tag_attrs, self._html_escape(after_display_url), invisible_tag_attrs, following_ellipsis) + return "%s %s%s%s %s" % (preceding_ellipsis, invisible_tag_attrs, invisible_tag_attrs, self._html_escape(before_display_url), self._html_escape(display_url_sans_ellipses), invisible_tag_attrs, self._html_escape(after_display_url), invisible_tag_attrs, following_ellipsis) else: return self._html_escape(display_url) @@ -356,13 +356,13 @@ def _link_to_hashtag(self, entity, chars, options = {}): if REGEXEN['rtl_chars'].search(hashtag): hashtag_class += ' rtl' - href = options.get('hashtag_url_transform', lambda ht: u'%s%s' % (options.get('hashtag_url_base'), ht))(hashtag) + href = options.get('hashtag_url_transform', lambda ht: '%s%s' % (options.get('hashtag_url_base'), ht))(hashtag) html_attrs = {} html_attrs.update(options.get('html_attrs', {})) html_attrs = { 'class': hashtag_class, - 'title': u'#%s' % hashtag, + 'title': '#%s' % hashtag, } link = self._link_to_text_with_symbol(entity, hashchar, hashtag, href, html_attrs, options) @@ -372,11 +372,11 @@ def _link_to_cashtag(self, entity, chars, options = {}): dollar = chars[entity['indices'][0]] cashtag = entity['cashtag'] - href = options.get('cashtag_url_transform', lambda ct: u'%s%s' % (options.get('cashtag_url_base'), ct))(cashtag) + href = options.get('cashtag_url_transform', lambda ct: '%s%s' % (options.get('cashtag_url_base'), ct))(cashtag) html_attrs = { 'class': options.get('cashtag_class'), - 'title': u'$%s' % cashtag + 'title': '$%s' % cashtag } html_attrs.update(options.get('html_attrs', {})) @@ -384,7 +384,7 @@ def _link_to_cashtag(self, entity, chars, options = {}): return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:] def _link_to_screen_name(self, entity, chars, options = {}): - name = u'%s%s' % (entity['screen_name'], entity.get('list_slug') or '') + name = '%s%s' % (entity['screen_name'], entity.get('list_slug') or '') chunk = options.get('link_text_transform', default_transform)(entity, name) name = name.lower() @@ -395,30 +395,30 @@ def _link_to_screen_name(self, entity, chars, options = {}): del(html_attrs['title']) if entity.get('list_slug') and not options.get('supress_lists'): - href = options.get('list_url_transform', lambda sn: u'%s%s' % (options.get('list_url_base'), sn))(name) + href = options.get('list_url_transform', lambda sn: '%s%s' % (options.get('list_url_base'), sn))(name) html_attrs['class'] = options.get('list_class') else: - href = 
diff --git a/twitter_text/highlighter.py b/twitter_text/highlighter.py
index ec128ca..f352b25 100644
--- a/twitter_text/highlighter.py
+++ b/twitter_text/highlighter.py
@@ -1,7 +1,7 @@
 # encoding=utf-8
-
+from __future__ import unicode_literals
 import re
-from HTMLParser import HTMLParser
+from six.moves import html_parser
 
 from twitter_text.regex import UNICODE_SPACES
 from twitter_text.unicode import force_unicode
@@ -9,7 +9,7 @@ DEFAULT_HIGHLIGHT_TAG = 'em'
 
 # from http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
-class MLStripper(HTMLParser):
+class MLStripper(html_parser.HTMLParser):
     def __init__(self):
         self.reset()
         self.fed = []
@@ -34,14 +34,14 @@ def hit_highlight(self, hits = [], **kwargs):
 
         if not hits and kwargs.get('query'):
             stripped_text = strip_tags(self.text)
-            for match in re.finditer(ur'%s' % kwargs.get('query'), stripped_text):
+            for match in re.finditer(r'%s' % kwargs.get('query'), stripped_text):
                 hits.append(match.span())
 
         if hits and not type(hits) == list:
             raise Exception('The syntax for the hit_highlight method has changed. You must pass in a list of lists containing the indices of the strings you want to match.')
 
         tag_name = kwargs.get('tag', DEFAULT_HIGHLIGHT_TAG)
-        tags = [u'<%s>' % tag_name, u'</%s>' % tag_name]
+        tags = ['<%s>' % tag_name, '</%s>' % tag_name]
 
         text = self.text
         chunks = re.split(r'[<>]', text)
@@ -58,7 +58,7 @@ def hit_highlight(self, hits = [], **kwargs):
                 if index % 2:
                     # we're inside a <tag>
                     continue
-                chunk_start = len(u''.join(text_chunks[0:index / 2]))
+                chunk_start = len(''.join(text_chunks[0:index // 2]))
                 chunk_end = chunk_start + len(chunk)
                 if hit_start >= chunk_start and hit_start < chunk_end:
                     chunk = chunk[:hit_start - chunk_start] + tags[0] + chunk[hit_start - chunk_start:]
@@ -76,8 +76,8 @@ def hit_highlight(self, hits = [], **kwargs):
         for index, chunk in enumerate(chunks):
             if index % 2:
                 # we're inside a <tag>
-                result.append(u'<%s>' % chunk)
+                result.append('<%s>' % chunk)
             else:
                 result.append(chunk)
 
-        self.text = u''.join(result)
-        return self.text
\ No newline at end of file
+        self.text = ''.join(result)
+        return self.text
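six.moves.html_parser resolves to the HTMLParser module on Python 2 and html.parser on Python 3, so the MLStripper subclass imports cleanly on both. One caveat worth noting: on Python 3.5+ HTMLParser.__init__ also sets convert_charrefs, so a subclass that only calls self.reset() can break at feed() time; a sketch of the safer pattern (illustrative, not part of the patch, with a hypothetical class name):

    from six.moves import html_parser

    class TextOnly(html_parser.HTMLParser):
        def __init__(self):
            # run the base initializer: it calls reset() and sets convert_charrefs
            html_parser.HTMLParser.__init__(self)
            self.fed = []

        def handle_data(self, data):
            self.fed.append(data)

    p = TextOnly()
    p.feed('<em>term</em> rest')
    p.close()
    assert ''.join(p.fed) == 'term rest'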
diff --git a/twitter_text/regex.py b/twitter_text/regex.py
index c136f80..2a2d9dc 100644
--- a/twitter_text/regex.py
+++ b/twitter_text/regex.py
@@ -4,36 +4,40 @@
 # list is frozen at load time to ensure immutability. These regular expressions are
 # used throughout the Twitter classes. Special care has been taken to make
 # sure these regular expressions work with Tweets in all languages.
-import re, string
+from __future__ import unicode_literals
+import re
+import six
+from six.moves import reduce
+
+REGEXEN = {}  # :nodoc:
 
-REGEXEN = {} # :nodoc:
 def regex_range(start, end = None):
     if end:
-        return u'%s-%s' % (unichr(start), unichr(end))
+        return '%s-%s' % (six.unichr(start), six.unichr(end))
     else:
-        return u'%s' % unichr(start)
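For reference, regex_range returns a character-class fragment rather than a compiled pattern; the module splices these fragments into larger classes. A usage sketch, assuming the function above:

    import re

    accented = re.compile('[%s]' % regex_range(0x00c0, 0x00d6))
    assert accented.match(u'\u00c9')  # É falls inside the À-Ö range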
 
 # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
 # to access both the list of characters and a pattern suitable for use with String#split
 # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
 UNICODE_SPACES = []
 for space in reduce(lambda x,y: x + y if type(y) == list else x + [y], [
-    range(0x0009, 0x000D), # White_Space # Cc [5] <control-0009>..<control-000D>
+    list(range(0x0009, 0x000D)), # White_Space # Cc [5] <control-0009>..<control-000D>
     0x0020, # White_Space # Zs SPACE
     0x0085, # White_Space # Cc <control-0085>
     0x00A0, # White_Space # Zs NO-BREAK SPACE
     0x1680, # White_Space # Zs OGHAM SPACE MARK
     0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
-    range(0x2000, 0x200A), # White_Space # Zs [11] EN QUAD..HAIR SPACE
+    list(range(0x2000, 0x200A)), # White_Space # Zs [11] EN QUAD..HAIR SPACE
     0x2028, # White_Space # Zl LINE SEPARATOR
     0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
     0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
     0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
     0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
 ]):
-    UNICODE_SPACES.append(unichr(space))
-REGEXEN['spaces'] = re.compile(ur''.join(UNICODE_SPACES))
+    UNICODE_SPACES.append(six.unichr(space))
+REGEXEN['spaces'] = re.compile(r''.join(UNICODE_SPACES))
 
 # Characters not allowed in Tweets
 INVALID_CHARACTERS = [
@@ -41,9 +45,9 @@ def regex_range(start, end = None):
     0xFFFF, # Special
     0x202A, 0x202B, 0x202C, 0x202D, 0x202E, # Directional change
 ]
-REGEXEN['invalid_control_characters'] = [unichr(x) for x in INVALID_CHARACTERS]
+REGEXEN['invalid_control_characters'] = [six.unichr(x) for x in INVALID_CHARACTERS]
 
-REGEXEN['list_name'] = re.compile(ur'^[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}$')
+REGEXEN['list_name'] = re.compile(r'^[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}$')
 
 # Latin accented characters
 # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
@@ -67,14 +71,14 @@ def regex_range(start, end = None):
     regex_range(0x0300, 0x036f),
     regex_range(0x1e00, 0x1eff),
 ]
-REGEXEN['latin_accents'] = re.compile(ur''.join(LATIN_ACCENTS), re.IGNORECASE | re.UNICODE)
-LATIN_ACCENTS = u''.join(LATIN_ACCENTS)
+REGEXEN['latin_accents'] = re.compile(r''.join(LATIN_ACCENTS), re.IGNORECASE | re.UNICODE)
+LATIN_ACCENTS = ''.join(LATIN_ACCENTS)
 
 RTL_CHARACTERS = ''.join([
-    regex_range(0x0600,0x06FF),
-    regex_range(0x0750,0x077F),
-    regex_range(0x0590,0x05FF),
-    regex_range(0xFE70,0xFEFF)
+    regex_range(0x0600, 0x06FF),
+    regex_range(0x0750, 0x077F),
+    regex_range(0x0590, 0x05FF),
+    regex_range(0xFE70, 0xFEFF)
 ])
 
 NON_LATIN_HASHTAG_CHARS = ''.join([
@@ -147,69 +151,69 @@ def regex_range(start, end = None):
     # this is a narrow python build so these extended Kanji characters won't work
     pass
 
-PUNCTUATION_CHARS = ur'!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
-SPACE_CHARS = ur" \t\n\x0B\f\r"
-CTRL_CHARS = ur"\x00-\x1F\x7F"
+PUNCTUATION_CHARS = r'!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
+SPACE_CHARS = r" \t\n\x0B\f\r"
+CTRL_CHARS = r"\x00-\x1F\x7F"
 
 # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
-HASHTAG_ALPHA = ur'[a-z_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
-HASHTAG_ALPHANUMERIC = ur'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
-HASHTAG_BOUNDARY = ur'\A|\z|\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
+HASHTAG_ALPHA = r'[a-z_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
+HASHTAG_ALPHANUMERIC = r'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
+HASHTAG_BOUNDARY = r'\A|\Z|\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
 
-HASHTAG = re.compile(ur'(%s)(#|＃)(%s*%s%s*)' % (HASHTAG_BOUNDARY, HASHTAG_ALPHANUMERIC, HASHTAG_ALPHA, HASHTAG_ALPHANUMERIC), re.IGNORECASE)
+HASHTAG = re.compile(r'(%s)(#|＃)(%s*%s%s*)' % (HASHTAG_BOUNDARY, HASHTAG_ALPHANUMERIC, HASHTAG_ALPHA, HASHTAG_ALPHANUMERIC), re.IGNORECASE)
 
 REGEXEN['valid_hashtag'] = HASHTAG
-REGEXEN['end_hashtag_match'] = re.compile(ur'\A(?:[#＃]|:\/\/)', re.IGNORECASE | re.UNICODE)
-REGEXEN['numeric_only'] = re.compile(ur'^[\d]+$')
+REGEXEN['end_hashtag_match'] = re.compile(r'\A(?:[#＃]|://)', re.IGNORECASE | re.UNICODE)
+REGEXEN['numeric_only'] = re.compile(r'^[\d]+$')
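Since HASHTAG_ALPHA sits between the two HASHTAG_ALPHANUMERIC runs, the composed HASHTAG pattern requires at least one alphabetic character, which is what rejects all-numeric tags; the (#|＃) alternation also accepts the full-width hash sign. A quick sketch (not part of the patch):

    m = HASHTAG.search('see #twitter123')
    assert m.group(3) == 'twitter123'
    assert HASHTAG.search('item #123') is None  # digits only, no alpha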
 
-REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#\$%&*@＠]|^|RT:?)')
-REGEXEN['at_signs'] = re.compile(ur'[@＠]')
+REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#$%&*@＠]|^|RT:?)')
+REGEXEN['at_signs'] = re.compile(r'[@＠]')
 REGEXEN['valid_mention_or_list'] = re.compile(
-    ur'(%s)' % REGEXEN['valid_mention_preceding_chars'].pattern.decode('utf-8') +  # preceding character
-    ur'(%s)' % REGEXEN['at_signs'].pattern +                                       # at mark
-    ur'([a-zA-Z0-9_]{1,20})' +                                                     # screen name
-    ur'(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?'                                          # list (optional)
+    r'(%s)' % REGEXEN['valid_mention_preceding_chars'].pattern +  # preceding character
+    r'(%s)' % REGEXEN['at_signs'].pattern +                       # at mark
+    r'([a-zA-Z0-9_]{1,20})' +                                     # screen name
+    r'(/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?'                           # list (optional)
 )
 
-REGEXEN['valid_reply'] = re.compile(ur'^(?:[%s])*%s([a-zA-Z0-9_]{1,20})' % (REGEXEN['spaces'].pattern, REGEXEN['at_signs'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_reply'] = re.compile(r'^(?:[%s])*%s([a-zA-Z0-9_]{1,20})' % (REGEXEN['spaces'].pattern, REGEXEN['at_signs'].pattern), re.IGNORECASE | re.UNICODE)
 # Used in Extractor for final filtering
-REGEXEN['end_mention_match'] = re.compile(ur'\A(?:%s|[%s]|:\/\/)' % (REGEXEN['at_signs'].pattern, REGEXEN['latin_accents'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['end_mention_match'] = re.compile(r'\A(?:%s|[%s]|://)' % (REGEXEN['at_signs'].pattern, REGEXEN['latin_accents'].pattern), re.IGNORECASE | re.UNICODE)
 
 # URL related hash regex collection
-REGEXEN['valid_url_preceding_chars'] = re.compile(ur'(?:[^A-Z0-9@＠$#＃%s]|^)' % ur''.join(REGEXEN['invalid_control_characters']), re.IGNORECASE | re.UNICODE)
-REGEXEN['invalid_url_without_protocol_preceding_chars'] = re.compile(ur'[-_.\/]$')
-DOMAIN_VALID_CHARS = ur'[^%s%s%s%s%s]' % (PUNCTUATION_CHARS, SPACE_CHARS, CTRL_CHARS, ur''.join(REGEXEN['invalid_control_characters']), ur''.join(UNICODE_SPACES))
-REGEXEN['valid_subdomain'] = re.compile(ur'(?:(?:%s(?:[_-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_domain_name'] = re.compile(ur'(?:(?:%s(?:[-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_gTLD'] = re.compile(ur'(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_ccTLD'] = re.compile(ur'(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|中國|新加坡|台湾|台灣|香港|한국)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_punycode'] = re.compile(ur'(?:xn--[0-9a-z]+)', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_preceding_chars'] = re.compile(r'(?:[^A-Z0-9@＠$#＃%s]|^)' % r''.join(REGEXEN['invalid_control_characters']), re.IGNORECASE | re.UNICODE)
+REGEXEN['invalid_url_without_protocol_preceding_chars'] = re.compile(r'[-_./]$')
+DOMAIN_VALID_CHARS = r'[^%s%s%s%s%s]' % (PUNCTUATION_CHARS, SPACE_CHARS, CTRL_CHARS, r''.join(REGEXEN['invalid_control_characters']), r''.join(UNICODE_SPACES))
+REGEXEN['valid_subdomain'] = re.compile(r'(?:(?:%s(?:[_-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_domain_name'] = re.compile(r'(?:(?:%s(?:[-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_gTLD'] = re.compile(r'(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_ccTLD'] = re.compile(r'(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|中國|新加坡|台湾|台灣|香港|한국)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_punycode'] = re.compile(r'(?:xn--[0-9a-z]+)', re.IGNORECASE | re.UNICODE)
 
-REGEXEN['valid_domain'] = re.compile(ur'(?:%s*%s(?:%s|%s|%s))' % (REGEXEN['valid_subdomain'].pattern, REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_domain'] = re.compile(r'(?:%s*%s(?:%s|%s|%s))' % (REGEXEN['valid_subdomain'].pattern, REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
 
 # This is used in Extractor
-REGEXEN['valid_ascii_domain'] = re.compile(ur'(?:(?:[A-Za-z0-9\-_]|[%s])+\.)+(?:%s|%s|%s)' % (REGEXEN['latin_accents'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_ascii_domain'] = re.compile(r'(?:(?:[A-Za-z0-9\-_]|[%s])+\.)+(?:%s|%s|%s)' % (REGEXEN['latin_accents'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
 
 # This is used in Extractor for stricter t.co URL extraction
-REGEXEN['valid_tco_url'] = re.compile(ur'^https?:\/\/t\.co\/[a-z0-9]+', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_tco_url'] = re.compile(r'^https?://t\.co/[a-z0-9]+', re.IGNORECASE | re.UNICODE)
 
 # This is used in Extractor to filter out unwanted URLs.
-REGEXEN['invalid_short_domain'] = re.compile(ur'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_ccTLD'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['invalid_short_domain'] = re.compile(r'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_ccTLD'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['valid_port_number'] = re.compile(ur'[0-9]+')
+REGEXEN['valid_port_number'] = re.compile(r'[0-9]+')
 
-REGEXEN['valid_general_url_path_chars'] = re.compile(ur"[a-z0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % LATIN_ACCENTS, re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_general_url_path_chars'] = re.compile(r"[a-z0-9!*';:=+,.$/%%#\[\]\-_~&|@%s]" % LATIN_ACCENTS, re.IGNORECASE | re.UNICODE)
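REGEXEN['valid_tco_url'] anchors at the start of the string and requires at least one slug character, so a bare t.co root is rejected. Sketch (not part of the patch):

    assert REGEXEN['valid_tco_url'].match('https://t.co/AbC123')
    assert REGEXEN['valid_tco_url'].match('https://t.co/') is None  # no slug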
 
 # Allow URL paths to contain balanced parens
 #  1. Used in Wikipedia URLs like /Primer_(film)
 #  2. Used in IIS sessions like /S(dfd346)/
-REGEXEN['valid_url_balanced_parens'] = re.compile(ur'\(%s+\)' % REGEXEN['valid_general_url_path_chars'].pattern, re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_balanced_parens'] = re.compile(r'\(%s+\)' % REGEXEN['valid_general_url_path_chars'].pattern, re.IGNORECASE | re.UNICODE)
 
 # Valid end-of-path characters (so /foo. does not gobble the period).
 #  1. Allow =&# for empty URL parameters and other URL-join artifacts
-REGEXEN['valid_url_path_ending_chars'] = re.compile(ur'[a-z0-9=_#\/\+\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_url_path'] = re.compile(ur'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+\/))' % (REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_balanced_parens'].pattern, REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_path_ending_chars'].pattern, REGEXEN['valid_general_url_path_chars'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_path_ending_chars'] = re.compile(r'[a-z0-9=_#/+\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_path'] = re.compile(r'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+/))' % (REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_balanced_parens'].pattern, REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_path_ending_chars'].pattern, REGEXEN['valid_general_url_path_chars'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['valid_url_query_chars'] = re.compile(ur"[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]", re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_url_query_ending_chars'] = re.compile(ur'[a-z0-9_&=#\/]', re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_url'] = re.compile(ur'((%s)((https?:\/\/)?(%s)(?::(%s))?(/%s*)?(\?%s*%s)?))' % (
+REGEXEN['valid_url_query_chars'] = re.compile(r"[a-z0-9!?*'();:&=+$/%#\[\]\-_.,~|@]", re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_query_ending_chars'] = re.compile(r'[a-z0-9_&=#/]', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url'] = re.compile(r'((%s)((https?://)?(%s)(?::(%s))?(/%s*)?(\?%s*%s)?))' % (
     REGEXEN['valid_url_preceding_chars'].pattern,
     REGEXEN['valid_domain'].pattern,
     REGEXEN['valid_port_number'].pattern,
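For reference, none of the interpolated sub-patterns in valid_url capture, so the group numbers line up exactly as the $-comments below describe: 1 is the whole match, 2 the preceding character, 3 the complete URL, 4 the protocol, 5 the domain, 6 the port, 7 the path and 8 the query. A sketch (not part of the patch):

    m = REGEXEN['valid_url'].search('visit https://example.com:8080/a?b=1')
    assert m.group(3) == 'https://example.com:8080/a?b=1'
    assert m.group(4) == 'https://'
    assert m.group(5) == 'example.com'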
@@ -227,54 +231,54 @@ def regex_range(start, end = None):
 # $7 URL Path and anchor
 # $8 Query String
 
-REGEXEN['cashtag'] = re.compile(ur'[a-z]{1,6}(?:[._][a-z]{1,2})?', re.IGNORECASE)
-REGEXEN['valid_cashtag'] = re.compile(ur'(^|[%s])(\$|＄|﹩)(%s)(?=$|\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE)
+REGEXEN['cashtag'] = re.compile(r'[a-z]{1,6}(?:[._][a-z]{1,2})?', re.IGNORECASE)
+REGEXEN['valid_cashtag'] = re.compile(r'(^|[%s])(\$|＄|﹩)(%s)(?=$|\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE)
 
 # These URL validation pattern strings are based on the ABNF from RFC 3986
-REGEXEN['validate_url_unreserved'] = re.compile(ur'[a-z0-9\-._~]', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_pct_encoded'] = re.compile(ur'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_sub_delims'] = re.compile(ur"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_pchar'] = re.compile(ur'(?:%s|%s|%s|[:\|@])' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unreserved'] = re.compile(r'[a-z0-9\-._~]', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_pct_encoded'] = re.compile(r'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_sub_delims'] = re.compile(r"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_pchar'] = re.compile(r'(?:%s|%s|%s|[:|@])' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_scheme'] = re.compile(ur'(?:[a-z][a-z0-9+\-.]*)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_userinfo'] = re.compile(ur'(?:%s|%s|%s|:)*' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_scheme'] = re.compile(r'(?:[a-z][a-z0-9+\-.]*)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_userinfo'] = re.compile(r'(?:%s|%s|%s|:)*' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_dec_octet'] = re.compile(ur'(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_ipv4'] = re.compile(ur'(?:%s(?:\.%s){3})' % (REGEXEN['validate_url_dec_octet'].pattern, REGEXEN['validate_url_dec_octet'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_dec_octet'] = re.compile(r'(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_ipv4'] = re.compile(r'(?:%s(?:\.%s){3})' % (REGEXEN['validate_url_dec_octet'].pattern, REGEXEN['validate_url_dec_octet'].pattern), re.IGNORECASE | re.UNICODE)
 
 # Punting on real IPv6 validation for now
-REGEXEN['validate_url_ipv6'] = re.compile(ur'(?:\[[a-f0-9:\.]+\])', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_ipv6'] = re.compile(r'(?:\[[a-f0-9:.]+\])', re.IGNORECASE | re.UNICODE)
 
 # Also punting on IPvFuture for now
-REGEXEN['validate_url_ip'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ipv4'].pattern, REGEXEN['validate_url_ipv6'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_ip'] = re.compile(r'(?:%s|%s)' % (REGEXEN['validate_url_ipv4'].pattern, REGEXEN['validate_url_ipv6'].pattern), re.IGNORECASE | re.UNICODE)
 
 # This is more strict than the rfc specifies
-REGEXEN['validate_url_subdomain_segment'] = re.compile(ur'(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_domain_segment'] = re.compile(ur'(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_domain_tld'] = re.compile(ur'(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_domain'] = re.compile(ur'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_subdomain_segment'].pattern, REGEXEN['validate_url_domain_segment'].pattern, REGEXEN['validate_url_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_subdomain_segment'] = re.compile(r'(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_domain_segment'] = re.compile(r'(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_domain_tld'] = re.compile(r'(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_domain'] = re.compile(r'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_subdomain_segment'].pattern, REGEXEN['validate_url_domain_segment'].pattern, REGEXEN['validate_url_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_host'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_domain'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_host'] = re.compile(r'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_domain'].pattern), re.IGNORECASE | re.UNICODE)
 
 # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
-REGEXEN['validate_url_unicode_subdomain_segment'] = re.compile(ur'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_unicode_domain_segment'] = re.compile(ur'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_unicode_domain_tld'] = re.compile(ur'(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_unicode_domain'] = re.compile(ur'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_unicode_subdomain_segment'].pattern, REGEXEN['validate_url_unicode_domain_segment'].pattern, REGEXEN['validate_url_unicode_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_subdomain_segment'] = re.compile(r'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_domain_segment'] = re.compile(r'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_domain_tld'] = re.compile(r'(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_domain'] = re.compile(r'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_unicode_subdomain_segment'].pattern, REGEXEN['validate_url_unicode_domain_segment'].pattern, REGEXEN['validate_url_unicode_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_unicode_host'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_unicode_domain'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_host'] = re.compile(r'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_unicode_domain'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_port'] = re.compile(ur'[0-9]{1,5}')
+REGEXEN['validate_url_port'] = re.compile(r'[0-9]{1,5}')
 
-REGEXEN['validate_url_unicode_authority'] = re.compile(ur'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_unicode_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unicode_authority'] = re.compile(r'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_unicode_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_authority'] = re.compile(ur'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_authority'] = re.compile(r'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
 
-REGEXEN['validate_url_path'] = re.compile(ur'(/%s*)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_query'] = re.compile(ur'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
-REGEXEN['validate_url_fragment'] = re.compile(ur'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_path'] = re.compile(r'(/%s*)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_query'] = re.compile(r'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_fragment'] = re.compile(r'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
 
 # Modified version of RFC 3986 Appendix B
-REGEXEN['validate_url_unencoded'] = re.compile(ur'\A(?:([^:/?#]+)://)?([^/?#]*)([^?#]*)(?:\?([^#]*))?(?:\#(.*))?\Z', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unencoded'] = re.compile(r'\A(?:([^:/?#]+)://)?([^/?#]*)([^?#]*)(?:\?([^#]*))?(?:#(.*))?\Z', re.IGNORECASE | re.UNICODE)
 
-REGEXEN['rtl_chars'] = re.compile(ur'[%s]' % RTL_CHARACTERS, re.IGNORECASE | re.UNICODE)
+REGEXEN['rtl_chars'] = re.compile(r'[%s]' % RTL_CHARACTERS, re.IGNORECASE | re.UNICODE)
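validate_url_unencoded is the classic RFC 3986 Appendix B splitter with anchors added; its five capture groups are scheme, authority, path, query and fragment. Sketch (not part of the patch):

    m = REGEXEN['validate_url_unencoded'].match('https://user@host.com:80/p?q=1#frag')
    assert m.groups() == ('https', 'user@host.com:80', '/p', 'q=1', 'frag')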
diff --git a/twitter_text/unicode.py b/twitter_text/unicode.py
index 4e17267..e62eda5 100644
--- a/twitter_text/unicode.py
+++ b/twitter_text/unicode.py
@@ -1,6 +1,9 @@
-import types, datetime
+import datetime
 from decimal import Decimal
 
+import six
+
+
 # borrowed from django.utils.encoding
 class TwitterTextUnicodeDecodeError(UnicodeDecodeError):
     def __init__(self, obj, *args):
@@ -12,52 +15,49 @@ def __str__(self):
         return '%s. You passed in %r (%s)' % (original, self.obj, type(self.obj))
 
+
 def is_protected_type(obj):
     """Determine if the object instance is of a protected type.
 
     Objects of protected types are preserved as-is when passed to
     force_unicode(strings_only=True).
     """
-    return isinstance(obj, (
-        types.NoneType,
-        int, long,
+    return isinstance(obj, six.integer_types + (
+        type(None),
         datetime.datetime, datetime.date, datetime.time,
         float, Decimal)
     )
 
+
 def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
     """
-    Similar to smart_unicode, except that lazy instances are resolved to
+    Similar to smart_text, except that lazy instances are resolved to
     strings, rather than kept as lazy objects.
 
    If strings_only is True, don't convert (some) non-string-like objects.
     """
+    # Handle the common case first for performance reasons.
+    if issubclass(type(s), six.text_type):
+        return s
     if strings_only and is_protected_type(s):
         return s
     try:
-        if not isinstance(s, basestring,):
-            if hasattr(s, '__unicode__'):
-                s = unicode(s)
+        if not issubclass(type(s), six.string_types):
+            if six.PY3:
+                if isinstance(s, bytes):
+                    s = six.text_type(s, encoding, errors)
+                else:
+                    s = six.text_type(s)
+            elif hasattr(s, '__unicode__'):
+                s = six.text_type(s)
             else:
-                try:
-                    s = unicode(str(s), encoding, errors)
-                except UnicodeEncodeError:
-                    if not isinstance(s, Exception):
-                        raise
-                    # If we get to here, the caller has passed in an Exception
-                    # subclass populated with non-ASCII data without special
-                    # handling to display as a string. We need to handle this
-                    # without raising a further exception. We do an
-                    # approximation to what the Exception's standard str()
-                    # output should be.
-                    s = ' '.join([force_unicode(arg, encoding, strings_only,
-                        errors) for arg in s])
-        elif not isinstance(s, unicode):
-            # Note: We use .decode() here, instead of unicode(s, encoding,
-            # errors), so that if s is a SafeString, it ends up being a
-            # SafeUnicode at the end.
+                s = six.text_type(bytes(s), encoding, errors)
+        else:
+            # Note: We use .decode() here, instead of six.text_type(s, encoding,
+            # errors), so that if s is a SafeBytes, it ends up being a
+            # SafeText at the end.
             s = s.decode(encoding, errors)
-    except UnicodeDecodeError, e:
+    except UnicodeDecodeError as e:
         if not isinstance(s, Exception):
             raise TwitterTextUnicodeDecodeError(s, *e.args)
         else:
@@ -66,6 +66,6 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
             # working unicode method. Try to handle this without raising a
             # further exception by individually forcing the exception args
             # to unicode.
-            s = ' '.join([force_unicode(arg, encoding, strings_only,
-                errors) for arg in s])
+            s = ' '.join(force_unicode(arg, encoding, strings_only, errors)
+                         for arg in s)
     return s
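The ported force_unicode keeps its old contract on both interpreters: text passes through untouched, bytes are decoded with the given encoding, other objects go through six.text_type, and protected types survive strings_only=True. Sketch (not part of the patch):

    from twitter_text.unicode import force_unicode

    assert force_unicode(u'caf\xe9') == u'caf\xe9'         # unicode: returned as-is
    assert force_unicode(b'caf\xc3\xa9') == u'caf\xe9'     # bytes: decoded as UTF-8
    assert force_unicode(42) == u'42'                      # non-string: converted
    assert force_unicode(None, strings_only=True) is None  # protected type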
diff --git a/twitter_text/validation.py b/twitter_text/validation.py
index 6dea5f9..fb442f3 100644
--- a/twitter_text/validation.py
+++ b/twitter_text/validation.py
@@ -1,5 +1,5 @@
 # encoding=utf-8
-
+from __future__ import unicode_literals
 import re
 
 from twitter_text.unicode import force_unicode
@@ -75,7 +75,7 @@ def tweet_invalid(self):
         if self.tweet_length() > MAX_LENGTH:
             valid, validation_error = False, 'Too long'
 
-        if re.search(ur''.join(REGEXEN['invalid_control_characters']), self.text):
+        if re.search(r''.join(REGEXEN['invalid_control_characters']), self.text):
             valid, validation_error = False, 'Invalid characters'
 
         if self.parent and hasattr(self.parent, 'tweet_is_valid'):
@@ -97,7 +97,7 @@ def valid_username(self):
         return len(extracted) == 1 and extracted[0] == self.text[1:]
 
     def valid_list(self):
-        match = re.compile(ur'^%s$' % REGEXEN['valid_mention_or_list'].pattern).search(self.text)
+        match = re.compile(r'^%s$' % REGEXEN['valid_mention_or_list'].pattern).search(self.text)
         return bool(match is not None and match.groups()[0] == "" and match.groups()[3])
 
     def valid_hashtag(self):
@@ -124,7 +124,7 @@ def valid_url(self, unicode_domains = True, require_protocol = True):
             not require_protocol or (
                 self._valid_match(scheme, REGEXEN['validate_url_scheme'])
-                and re.compile(ur'^https?$', re.IGNORECASE).match(scheme)
+                and re.compile(r'^https?$', re.IGNORECASE).match(scheme)
             )
         ) and (