From 163f8bee9cf9e711e2056e5bb50d3ee5f7dfd9e6 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sat, 13 Jan 2018 14:25:35 -0800 Subject: [PATCH 1/7] WIP: initial version of new regex-based parse_host --- hyperlink/_url_codecs.py | 97 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 hyperlink/_url_codecs.py diff --git a/hyperlink/_url_codecs.py b/hyperlink/_url_codecs.py new file mode 100644 index 00000000..c978f3b8 --- /dev/null +++ b/hyperlink/_url_codecs.py @@ -0,0 +1,97 @@ + +import re +import socket + +# RFC 3986 Section 2.3, Unreserved URI Characters +# https://tools.ietf.org/html/rfc3986#section-2.3 +_UNRESERVED_CHARS = frozenset('~-._0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz') + +# RFC 3986 section 2.2, Reserved Characters +# https://tools.ietf.org/html/rfc3986#section-2.2 +_GEN_DELIMS = frozenset(u':/?#[]@') +_SUB_DELIMS = frozenset(u"!$&'()*+,;=") +_ALL_DELIMS = _GEN_DELIMS | _SUB_DELIMS + + +# The following is based on Ian Cordasco's rfc3986 package + +IPv4_PATT = '([0-9]{1,3}.){3}[0-9]{1,3}' +IPv4_RE = re.compile(IPv4_PATT) +# Hexadecimal characters used in each piece of an IPv6 address +HEXDIG_PATT = '[0-9A-Fa-f]{1,4}' +# Least-significant 32 bits of an IPv6 address +LS32_RE = '(%(hex)s:%(hex)s|%(ipv4)s)' % {'hex': HEXDIG_PATT, 'ipv4': IPv4_PATT} +# Substitutions into the following patterns for IPv6 patterns defined +# http://tools.ietf.org/html/rfc3986#page-20 +_subs = {'hex': HEXDIG_PATT, 'ls32': LS32_RE} + +# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details +# about ABNF (Augmented Backus-Naur Form) use in the comments +_ipv6_variations = [ + # 6( h16 ":" ) ls32 + '(%(hex)s:){6}%(ls32)s' % _subs, + # "::" 5( h16 ":" ) ls32 + '::(%(hex)s:){5}%(ls32)s' % _subs, + # [ h16 ] "::" 4( h16 ":" ) ls32 + '(%(hex)s)?::(%(hex)s:){4}%(ls32)s' % _subs, + # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 + '((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s' % _subs, + # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 + '((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s' % _subs, + # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 + '((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s' % _subs, + # [ *4( h16 ":" ) h16 ] "::" ls32 + '((%(hex)s:){0,4}%(hex)s)?::%(ls32)s' % _subs, + # [ *5( h16 ":" ) h16 ] "::" h16 + '((%(hex)s:){0,5}%(hex)s)?::%(hex)s' % _subs, + # [ *6( h16 ":" ) h16 ] "::" + '((%(hex)s:){0,6}%(hex)s)?::' % _subs, +] + +IPv6_PATT = '(%s)' % '|'.join(['(%s)' % v for v in _ipv6_variations]) + +PERCENT_ENCODED_PATT = '%[A-Fa-f0-9]{2}' + +UNRESERVED_CHAR_PATT = 'A-Za-z0-9._~\-' +SUBDELIMS_CHAR_PATT = "!$&'()\*+,;=" + +IPv_FUTURE_PATT = ('v[0-9A-Fa-f]+.[%s]+' + % UNRESERVED_CHAR_PATT + SUBDELIMS_CHAR_PATT + ':') + + +# RFC 6874 Zone ID ABNF +ZONE_ID_PATT = '(?:[' + UNRESERVED_CHAR_PATT + ']|' + PERCENT_ENCODED_PATT + ')+' +IPv6_ADDRZ_PATT = IPv6_PATT + '%25' + ZONE_ID_PATT + +IP_LITERAL_PATT = ('\[(%s|(?:%s)|%s)\]' + % (IPv6_PATT, IPv6_ADDRZ_PATT, IPv_FUTURE_PATT)) + + +_IP_LITERAL_RE = re.compile(IP_LITERAL_PATT) + + +def parse_host(host): + if u':' in host: + try: + _IP_LITERAL_RE.match(host) + # TODO: pull out lowest 32-bits in case of ipv4-in-ipv6 + # pattern match and inet_pton them + ipv4_match = IPv4_RE.search(host) + if ipv4_match: + try: + socket.inet_pton(socket.AF_INET, ipv4_match.group(0)) + except socket.error as se: + raise ValueError('invalid IPv6 host with IPv4: %r (%r)' % (host, se)) + except socket.error as se: + # TODO: URLParseError + raise ValueError('invalid IPv6 host: %r (%r)' % (host, se)) + else: + return socket.AF_INET6, host + try: + socket.inet_pton(socket.AF_INET, host) + except (socket.error, UnicodeEncodeError): + family = None # not an IP + else: + family = socket.AF_INET + return family, host From 22550ad881dd0dded1ec0a58374b15d362f99f96 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sat, 13 Jan 2018 16:38:02 -0800 Subject: [PATCH 2/7] WIP: working on new parse_host, fixed a bug or two, but still too permissive --- hyperlink/_url.py | 70 ++++++++-------------------------------- hyperlink/_url_codecs.py | 60 ++++++++++++++++++++++++---------- 2 files changed, 56 insertions(+), 74 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 04ac4b71..61cec552 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -32,6 +32,7 @@ # Note: IDNAError is a subclass of UnicodeError from idna import encode as idna_encode, decode as idna_decode, IDNAError +from ._url_codecs import parse_host, URLParseError if inet_pton is None: # based on https://gist.github.com/nnemkin/4966028 @@ -419,13 +420,6 @@ def scheme_uses_netloc(scheme, default=None): return default -class URLParseError(ValueError): - """Exception inheriting from :exc:`ValueError`, raised when failing to - parse a URL. Mostly raised on invalid ports and IPv6 addresses. - """ - pass - - def _optional(argument, default): if argument is _UNSET: return default @@ -523,18 +517,22 @@ def _percent_decode(text, normalize_case=False, subencoding='utf-8', u'abc def' Args: - text (unicode): The ASCII text with percent-encoding present. + text (unicode): Text with percent-encoding present. normalize_case (bool): Whether undecoded percent segments, such as encoded delimiters, should be uppercased, per RFC 3986 Section 2.1. See :func:`_decode_path_part` for an example. + subencoding (unicode): The name of the encoding underlying the + percent-encoding. Pass `False` to get back bytes. + raise_subencoding_exc (bool): Whether an error in decoding the bytes + underlying the percent-decoding should be raised. Returns: - unicode: The percent-decoded version of *text*, with UTF-8 - decoding applied. + unicode: The percent-decoded version of *text*, with decoding + applied, unless `subencoding=False` which returns bytes. """ try: - quoted_bytes = text.encode("ascii") + quoted_bytes = text.encode(subencoding or 'utf-8') except UnicodeEncodeError: return text @@ -671,44 +669,6 @@ def _resolve_dot_segments(path): return segs -def parse_host(host): - """Parse the host into a tuple of ``(family, host)``, where family - is the appropriate :mod:`socket` module constant when the host is - an IP address. Family is ``None`` when the host is not an IP. - - Will raise :class:`URLParseError` on invalid IPv6 constants. - - Returns: - tuple: family (socket constant or None), host (string) - - >>> parse_host('googlewebsite.com') == (None, 'googlewebsite.com') - True - >>> parse_host('::1') == (socket.AF_INET6, '::1') - True - >>> parse_host('192.168.1.1') == (socket.AF_INET, '192.168.1.1') - True - """ - if not host: - return None, u'' - if u':' in host: - try: - inet_pton(socket.AF_INET6, host) - except socket.error as se: - raise URLParseError('invalid IPv6 host: %r (%r)' % (host, se)) - except UnicodeEncodeError: - pass # TODO: this can't be a real host right? - else: - family = socket.AF_INET6 - return family, host - try: - inet_pton(socket.AF_INET, host) - except (socket.error, UnicodeEncodeError): - family = None # not an IP - else: - family = socket.AF_INET - return family, host - - class URL(object): """From blogs to billboards, URLs are so common, that it's easy to overlook their complexity and power. With hyperlink's @@ -1673,8 +1633,7 @@ def path(self): return self._path except AttributeError: pass - self._path = tuple([_percent_decode(_encode_path_part(p), - raise_subencoding_exc=True) + self._path = tuple([_percent_decode(p, raise_subencoding_exc=True) for p in self._url.path]) return self._path @@ -1684,8 +1643,7 @@ def query(self): return self._query except AttributeError: pass - _q = [tuple(_percent_decode(_encode_query_part(x), - raise_subencoding_exc=True) + _q = [tuple(_percent_decode(x, raise_subencoding_exc=True) if x is not None else None for x in (k, v)) for k, v in self._url.query] @@ -1699,8 +1657,7 @@ def fragment(self): except AttributeError: pass frag = self._url.fragment - self._fragment = _percent_decode(_encode_fragment_part(frag), - raise_subencoding_exc=True) + self._fragment = _percent_decode(frag, raise_subencoding_exc=True) return self._fragment @property @@ -1709,8 +1666,7 @@ def userinfo(self): return self._userinfo except AttributeError: pass - self._userinfo = tuple([_percent_decode(_encode_userinfo_part(p), - raise_subencoding_exc=True) + self._userinfo = tuple([_percent_decode(p, raise_subencoding_exc=True) for p in self._url.userinfo.split(':', 1)]) return self._userinfo diff --git a/hyperlink/_url_codecs.py b/hyperlink/_url_codecs.py index c978f3b8..f58e16c1 100644 --- a/hyperlink/_url_codecs.py +++ b/hyperlink/_url_codecs.py @@ -2,6 +2,15 @@ import re import socket + +class URLParseError(ValueError): + """Exception inheriting from :exc:`ValueError`, raised when failing to + parse a URL. Mostly raised on invalid ports and IPv6 addresses. + """ + pass + +# TODO: fewer capturing groups + # RFC 3986 Section 2.3, Unreserved URI Characters # https://tools.ietf.org/html/rfc3986#section-2.3 _UNRESERVED_CHARS = frozenset('~-._0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' @@ -16,7 +25,7 @@ # The following is based on Ian Cordasco's rfc3986 package -IPv4_PATT = '([0-9]{1,3}.){3}[0-9]{1,3}' +IPv4_PATT = '([0-9]{1,3}\.){3}[0-9]{1,3}' IPv4_RE = re.compile(IPv4_PATT) # Hexadecimal characters used in each piece of an IPv6 address HEXDIG_PATT = '[0-9A-Fa-f]{1,4}' @@ -64,7 +73,7 @@ ZONE_ID_PATT = '(?:[' + UNRESERVED_CHAR_PATT + ']|' + PERCENT_ENCODED_PATT + ')+' IPv6_ADDRZ_PATT = IPv6_PATT + '%25' + ZONE_ID_PATT -IP_LITERAL_PATT = ('\[(%s|(?:%s)|%s)\]' +IP_LITERAL_PATT = ('(%s|(?:%s)|%s)' % (IPv6_PATT, IPv6_ADDRZ_PATT, IPv_FUTURE_PATT)) @@ -72,25 +81,42 @@ def parse_host(host): + """Parse the host into a tuple of ``(family, host)``, where family + is the appropriate :mod:`socket` module constant when the host is + an IP address. Family is ``None`` when the host is not an IP. + + Will raise :class:`URLParseError` on invalid IPv6 constants. + + Returns: + tuple: family (socket constant or None), host (string) + + >>> parse_host('googlewebsite.com') == (None, 'googlewebsite.com') + True + >>> parse_host('::1') == (socket.AF_INET6, '::1') + True + >>> parse_host('192.168.1.1') == (socket.AF_INET, '192.168.1.1') + True + """ + if not host: + return None, u'' if u':' in host: - try: - _IP_LITERAL_RE.match(host) - # TODO: pull out lowest 32-bits in case of ipv4-in-ipv6 - # pattern match and inet_pton them - ipv4_match = IPv4_RE.search(host) - if ipv4_match: - try: - socket.inet_pton(socket.AF_INET, ipv4_match.group(0)) - except socket.error as se: - raise ValueError('invalid IPv6 host with IPv4: %r (%r)' % (host, se)) - except socket.error as se: - # TODO: URLParseError - raise ValueError('invalid IPv6 host: %r (%r)' % (host, se)) - else: - return socket.AF_INET6, host + ipv6_match = _IP_LITERAL_RE.match(host) + if ipv6_match is None: + raise URLParseError(u'invalid IPv6 host: %r' % host) + if host.startswith('2001'): + import pdb;pdb.set_trace() + ipv4_match = IPv4_RE.search(host) + if ipv4_match: + try: + socket.inet_pton(socket.AF_INET, ipv4_match.group(0)) + except socket.error as se: # socket.error _is_ OSError on Py3 + raise URLParseError(u'invalid IPv6 host with IPv4: %r' % host) + return socket.AF_INET6, host try: socket.inet_pton(socket.AF_INET, host) except (socket.error, UnicodeEncodeError): + # inet_pton raises socket.error on py2, OSError on py3 + # UnicodeEncodeError is only reached on non-ASCII unicode hosts family = None # not an IP else: family = socket.AF_INET From f2ffe627083894992d643cf4c2f49e73b2367f25 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sat, 13 Jan 2018 19:30:41 -0800 Subject: [PATCH 3/7] few fixes around parse_host, add a bunch of ipv6 test cases now that we're doing that in regex --- hyperlink/_url_codecs.py | 12 +- hyperlink/test/ipv6_test_cases.py | 489 ++++++++++++++++++++++++++++++ hyperlink/test/test_parse.py | 2 +- hyperlink/test/test_parse_host.py | 27 ++ 4 files changed, 523 insertions(+), 7 deletions(-) create mode 100644 hyperlink/test/ipv6_test_cases.py create mode 100644 hyperlink/test/test_parse_host.py diff --git a/hyperlink/_url_codecs.py b/hyperlink/_url_codecs.py index f58e16c1..5d95dc6f 100644 --- a/hyperlink/_url_codecs.py +++ b/hyperlink/_url_codecs.py @@ -25,15 +25,17 @@ class URLParseError(ValueError): # The following is based on Ian Cordasco's rfc3986 package +# TODO: This pattern isn't perfect, so we double check with inet_pton +# below, this will have to change for windows IPv4_PATT = '([0-9]{1,3}\.){3}[0-9]{1,3}' IPv4_RE = re.compile(IPv4_PATT) # Hexadecimal characters used in each piece of an IPv6 address HEXDIG_PATT = '[0-9A-Fa-f]{1,4}' # Least-significant 32 bits of an IPv6 address -LS32_RE = '(%(hex)s:%(hex)s|%(ipv4)s)' % {'hex': HEXDIG_PATT, 'ipv4': IPv4_PATT} +LS32_PATT = '(%(hex)s:%(hex)s|%(ipv4)s)' % {'hex': HEXDIG_PATT, 'ipv4': IPv4_PATT} # Substitutions into the following patterns for IPv6 patterns defined # http://tools.ietf.org/html/rfc3986#page-20 -_subs = {'hex': HEXDIG_PATT, 'ls32': LS32_RE} +_subs = {'hex': HEXDIG_PATT, 'ls32': LS32_PATT} # Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details # about ABNF (Augmented Backus-Naur Form) use in the comments @@ -73,7 +75,7 @@ class URLParseError(ValueError): ZONE_ID_PATT = '(?:[' + UNRESERVED_CHAR_PATT + ']|' + PERCENT_ENCODED_PATT + ')+' IPv6_ADDRZ_PATT = IPv6_PATT + '%25' + ZONE_ID_PATT -IP_LITERAL_PATT = ('(%s|(?:%s)|%s)' +IP_LITERAL_PATT = ('^(%s|(?:%s)|%s)\Z' % (IPv6_PATT, IPv6_ADDRZ_PATT, IPv_FUTURE_PATT)) @@ -103,13 +105,11 @@ def parse_host(host): ipv6_match = _IP_LITERAL_RE.match(host) if ipv6_match is None: raise URLParseError(u'invalid IPv6 host: %r' % host) - if host.startswith('2001'): - import pdb;pdb.set_trace() ipv4_match = IPv4_RE.search(host) if ipv4_match: try: socket.inet_pton(socket.AF_INET, ipv4_match.group(0)) - except socket.error as se: # socket.error _is_ OSError on Py3 + except socket.error: # NB: socket.error _is_ OSError on Py3 raise URLParseError(u'invalid IPv6 host with IPv4: %r' % host) return socket.AF_INET6, host try: diff --git a/hyperlink/test/ipv6_test_cases.py b/hyperlink/test/ipv6_test_cases.py new file mode 100644 index 00000000..46aa1e1b --- /dev/null +++ b/hyperlink/test/ipv6_test_cases.py @@ -0,0 +1,489 @@ + +# The following test cases are based on the suite made available by +# Dartware, made available under the following Creative Commons license: + +# LICENSE +# +# IPv6 Regex by Dartware, LLC is licensed under a +# Creative Commons Attribution-ShareAlike 3.0 Unported License. +# http://creativecommons.org/licenses/by-sa/3.0/ +# +# Please mention Dartware and provide a link back to our site +# in the documentation with other attributions. It should say, +# +# --- +# IPv6 regular expression courtesy of Dartware, LLC (http://intermapper.com) +# For full details see http://intermapper.com/ipv6regex +# --- + + +DW_IPv6_TEST_CASES = \ +[{'heading': 'IPv4 addresses as dotted-quads', + 'notes': '', + 'tests': [('1:2:3:4:5:6:1.2.3.4', True, ''), + ('1:2:3:4:5::1.2.3.4', True, ''), + ('1:2:3:4::1.2.3.4', True, ''), + ('1:2:3::1.2.3.4', True, ''), + ('1:2::1.2.3.4', True, ''), + ('1::1.2.3.4', True, ''), + ('1:2:3:4::5:1.2.3.4', True, ''), + ('1:2:3::5:1.2.3.4', True, ''), + ('1:2::5:1.2.3.4', True, ''), + ('1::5:1.2.3.4', True, ''), + ('1::5:11.22.33.44', True, ''), + ('1::5:400.2.3.4', False, ''), + ('1::5:260.2.3.4', False, ''), + ('1::5:256.2.3.4', False, ''), + ('1::5:1.256.3.4', False, ''), + ('1::5:1.2.256.4', False, ''), + ('1::5:1.2.3.256', False, ''), + ('1::5:300.2.3.4', False, ''), + ('1::5:1.300.3.4', False, ''), + ('1::5:1.2.300.4', False, ''), + ('1::5:1.2.3.300', False, ''), + ('1::5:900.2.3.4', False, ''), + ('1::5:1.900.3.4', False, ''), + ('1::5:1.2.900.4', False, ''), + ('1::5:1.2.3.900', False, ''), + ('1::5:300.300.300.300', False, ''), + ('1::5:3000.30.30.30', False, ''), + ('1::400.2.3.4', False, ''), + ('1::260.2.3.4', False, ''), + ('1::256.2.3.4', False, ''), + ('1::1.256.3.4', False, ''), + ('1::1.2.256.4', False, ''), + ('1::1.2.3.256', False, ''), + ('1::300.2.3.4', False, ''), + ('1::1.300.3.4', False, ''), + ('1::1.2.300.4', False, ''), + ('1::1.2.3.300', False, ''), + ('1::900.2.3.4', False, ''), + ('1::1.900.3.4', False, ''), + ('1::1.2.900.4', False, ''), + ('1::1.2.3.900', False, ''), + ('1::300.300.300.300', False, ''), + ('1::3000.30.30.30', False, ''), + ('::400.2.3.4', False, ''), + ('::260.2.3.4', False, ''), + ('::256.2.3.4', False, ''), + ('::1.256.3.4', False, ''), + ('::1.2.256.4', False, ''), + ('::1.2.3.256', False, ''), + ('::300.2.3.4', False, ''), + ('::1.300.3.4', False, ''), + ('::1.2.300.4', False, ''), + ('::1.2.3.300', False, ''), + ('::900.2.3.4', False, ''), + ('::1.900.3.4', False, ''), + ('::1.2.900.4', False, ''), + ('::1.2.3.900', False, ''), + ('::300.300.300.300', False, ''), + ('::3000.30.30.30', False, ''), + ('fe80::217:f2ff:254.7.237.98', True, ''), + ('::ffff:192.168.1.26', True, ''), + ('2001:1:1:1:1:1:255Z255X255Y255', False, 'garbage instead of "." in IPv4'), + ('::ffff:192x168.1.26', False, 'ditto'), + ('::ffff:192.168.1.1', True, ''), + ('0:0:0:0:0:0:13.1.68.3', True, 'IPv4-compatible IPv6 address, full, deprecated'), + ('0:0:0:0:0:FFFF:129.144.52.38', True, 'IPv4-mapped IPv6 address, full'), + ('::13.1.68.3', True, 'IPv4-compatible IPv6 address, compressed, deprecated'), + ('::FFFF:129.144.52.38', True, 'IPv4-mapped IPv6 address, compressed'), + ('fe80:0:0:0:204:61ff:254.157.241.86', True, ''), + ('fe80::204:61ff:254.157.241.86', True, ''), + ('::ffff:12.34.56.78', True, ''), + ('::ffff:2.3.4', False, ''), + ('::ffff:257.1.2.3', False, ''), + ('1.2.3.4:1111:2222:3333:4444::5555', False, 'Aeron'), + ('1.2.3.4:1111:2222:3333::5555', False, ''), + ('1.2.3.4:1111:2222::5555', False, ''), + ('1.2.3.4:1111::5555', False, ''), + ('1.2.3.4::5555', False, ''), + ('1.2.3.4::', False, '')]}, + {'heading': 'Testing IPv4 addresses represented as dotted-quads', + 'notes': 'Leading zero\'s in IPv4 addresses not allowed: some systems treat the leading "0" in ".086" as the start of an octal number Update: The BNF in RFC-3986 explicitly defines the dec-octet (for IPv4 addresses) not to have a leading zero ', + 'tests': [('fe80:0000:0000:0000:0204:61ff:254.157.241.086', False, ''), + ('::ffff:192.0.2.128', True, "but this is OK, since there's a single digit"), + ('XXXX:XXXX:XXXX:XXXX:XXXX:XXXX:1.2.3.4', False, ''), + ('1111:2222:3333:4444:5555:6666:00.00.00.00', False, ''), + ('1111:2222:3333:4444:5555:6666:000.000.000.000', False, ''), + ('1111:2222:3333:4444:5555:6666:256.256.256.256', False, '')]}, + {'heading': 'Not testing address with subnet mask', + 'notes': '', + 'tests': [('fe80:0000:0000:0000:0204:61ff:fe9d:f156', True, ''), + ('fe80:0:0:0:204:61ff:fe9d:f156', True, ''), + ('fe80::204:61ff:fe9d:f156', True, ''), + ('::1', True, ''), + ('fe80::', True, ''), + ('fe80::1', True, ''), + (':', False, ''), + ('::ffff:c000:280', True, '')]}, + {'heading': 'Aeron supplied these test cases', + 'notes': '', + 'tests': [('1111:2222:3333:4444::5555:', False, ''), + ('1111:2222:3333::5555:', False, ''), + ('1111:2222::5555:', False, ''), + ('1111::5555:', False, ''), + ('::5555:', False, ''), + (':::', False, ''), + ('1111:', False, ''), + (':', False, ''), + (':1111:2222:3333:4444::5555', False, ''), + (':1111:2222:3333::5555', False, ''), + (':1111:2222::5555', False, ''), + (':1111::5555', False, ''), + (':::5555', False, ''), + (':::', False, '')]}, + {'heading': 'Additional test cases', + 'notes': 'from http://rt.cpan.org/Public/Bug/Display.html?id=50693 ', + 'tests': [('2001:0db8:85a3:0000:0000:8a2e:0370:7334', True, ''), + ('2001:db8:85a3:0:0:8a2e:370:7334', True, ''), + ('2001:db8:85a3::8a2e:370:7334', True, ''), + ('2001:0db8:0000:0000:0000:0000:1428:57ab', True, ''), + ('2001:0db8:0000:0000:0000::1428:57ab', True, ''), + ('2001:0db8:0:0:0:0:1428:57ab', True, ''), + ('2001:0db8:0:0::1428:57ab', True, ''), + ('2001:0db8::1428:57ab', True, ''), + ('2001:db8::1428:57ab', True, ''), + ('0000:0000:0000:0000:0000:0000:0000:0001', True, ''), + ('::1', True, ''), + ('::ffff:0c22:384e', True, ''), + ('2001:0db8:1234:0000:0000:0000:0000:0000', True, ''), + ('2001:0db8:1234:ffff:ffff:ffff:ffff:ffff', True, ''), + ('2001:db8:a::123', True, ''), + ('fe80::', True, ''), + ('123', False, ''), + ('ldkfj', False, ''), + ('2001::FFD3::57ab', False, ''), + ('2001:db8:85a3::8a2e:37023:7334', False, ''), + ('2001:db8:85a3::8a2e:370k:7334', False, ''), + ('1:2:3:4:5:6:7:8:9', False, ''), + ('1::2::3', False, ''), + ('1:::3:4:5', False, ''), + ('1:2:3::4:5:6:7:8:9', False, '')]}, + {'heading': 'New from Aeron', + 'notes': '', + 'tests': [('1111:2222:3333:4444:5555:6666:7777:8888', True, ''), + ('1111:2222:3333:4444:5555:6666:7777::', True, ''), + ('1111:2222:3333:4444:5555:6666::', True, ''), + ('1111:2222:3333:4444:5555::', True, ''), + ('1111:2222:3333:4444::', True, ''), + ('1111:2222:3333::', True, ''), + ('1111:2222::', True, ''), + ('1111::', True, ''), + ('1111:2222:3333:4444:5555:6666::8888', True, ''), + ('1111:2222:3333:4444:5555::8888', True, ''), + ('1111:2222:3333:4444::8888', True, ''), + ('1111:2222:3333::8888', True, ''), + ('1111:2222::8888', True, ''), + ('1111::8888', True, ''), + ('::8888', True, ''), + ('1111:2222:3333:4444:5555::7777:8888', True, ''), + ('1111:2222:3333:4444::7777:8888', True, ''), + ('1111:2222:3333::7777:8888', True, ''), + ('1111:2222::7777:8888', True, ''), + ('1111::7777:8888', True, ''), + ('::7777:8888', True, ''), + ('1111:2222:3333:4444::6666:7777:8888', True, ''), + ('1111:2222:3333::6666:7777:8888', True, ''), + ('1111:2222::6666:7777:8888', True, ''), + ('1111::6666:7777:8888', True, ''), + ('::6666:7777:8888', True, ''), + ('1111:2222:3333::5555:6666:7777:8888', True, ''), + ('1111:2222::5555:6666:7777:8888', True, ''), + ('1111::5555:6666:7777:8888', True, ''), + ('::5555:6666:7777:8888', True, ''), + ('1111:2222::4444:5555:6666:7777:8888', True, ''), + ('1111::4444:5555:6666:7777:8888', True, ''), + ('::4444:5555:6666:7777:8888', True, ''), + ('1111::3333:4444:5555:6666:7777:8888', True, ''), + ('::3333:4444:5555:6666:7777:8888', True, ''), + ('::2222:3333:4444:5555:6666:7777:8888', True, ''), + ('1111:2222:3333:4444:5555:6666:123.123.123.123', True, ''), + ('1111:2222:3333:4444:5555::123.123.123.123', True, ''), + ('1111:2222:3333:4444::123.123.123.123', True, ''), + ('1111:2222:3333::123.123.123.123', True, ''), + ('1111:2222::123.123.123.123', True, ''), + ('1111::123.123.123.123', True, ''), + ('::123.123.123.123', True, ''), + ('1111:2222:3333:4444::6666:123.123.123.123', True, ''), + ('1111:2222:3333::6666:123.123.123.123', True, ''), + ('1111:2222::6666:123.123.123.123', True, ''), + ('1111::6666:123.123.123.123', True, ''), + ('::6666:123.123.123.123', True, ''), + ('1111:2222:3333::5555:6666:123.123.123.123', True, ''), + ('1111:2222::5555:6666:123.123.123.123', True, ''), + ('1111::5555:6666:123.123.123.123', True, ''), + ('::5555:6666:123.123.123.123', True, ''), + ('1111:2222::4444:5555:6666:123.123.123.123', True, ''), + ('1111::4444:5555:6666:123.123.123.123', True, ''), + ('::4444:5555:6666:123.123.123.123', True, ''), + ('1111::3333:4444:5555:6666:123.123.123.123', True, ''), + ('::2222:3333:4444:5555:6666:123.123.123.123', True, '')]}, + {'heading': 'Playing with combinations of "0" and "::"', + 'notes': 'NB: these are all sytactically correct, but are bad form because "0" adjacent to "::" should be combined into "::" ', + 'tests': [('::0:0:0:0:0:0:0', True, ''), + ('::0:0:0:0:0:0', True, ''), + ('::0:0:0:0:0', True, ''), + ('::0:0:0:0', True, ''), + ('::0:0:0', True, ''), + ('::0:0', True, ''), + ('::0', True, ''), + ('0:0:0:0:0:0:0::', True, ''), + ('0:0:0:0:0:0::', True, ''), + ('0:0:0:0:0::', True, ''), + ('0:0:0:0::', True, ''), + ('0:0:0::', True, ''), + ('0:0::', True, ''), + ('0::', True, '')]}, + {'heading': 'New invalid from Aeron', + 'notes': 'Invalid data ', + 'tests': [('XXXX:XXXX:XXXX:XXXX:XXXX:XXXX:XXXX:XXXX', False, '')]}, + {'heading': 'Too many components', + 'notes': '', + 'tests': [('1111:2222:3333:4444:5555:6666:7777:8888:9999', False, ''), + ('1111:2222:3333:4444:5555:6666:7777:8888::', False, ''), + ('::2222:3333:4444:5555:6666:7777:8888:9999', False, '')]}, + {'heading': 'Too few components', + 'notes': '', + 'tests': [('1111:2222:3333:4444:5555:6666:7777', False, ''), + ('1111:2222:3333:4444:5555:6666', False, ''), + ('1111:2222:3333:4444:5555', False, ''), + ('1111:2222:3333:4444', False, ''), + ('1111:2222:3333', False, ''), + ('1111:2222', False, ''), + ('1111', False, '')]}, + {'heading': 'Missing :', + 'notes': '', + 'tests': [('11112222:3333:4444:5555:6666:7777:8888', False, ''), + ('1111:22223333:4444:5555:6666:7777:8888', False, ''), + ('1111:2222:33334444:5555:6666:7777:8888', False, ''), + ('1111:2222:3333:44445555:6666:7777:8888', False, ''), + ('1111:2222:3333:4444:55556666:7777:8888', False, ''), + ('1111:2222:3333:4444:5555:66667777:8888', False, ''), + ('1111:2222:3333:4444:5555:6666:77778888', False, '')]}, + {'heading': 'Missing : intended for ::', + 'notes': '', + 'tests': [('1111:2222:3333:4444:5555:6666:7777:8888:', False, ''), + ('1111:2222:3333:4444:5555:6666:7777:', False, ''), + ('1111:2222:3333:4444:5555:6666:', False, ''), + ('1111:2222:3333:4444:5555:', False, ''), + ('1111:2222:3333:4444:', False, ''), + ('1111:2222:3333:', False, ''), + ('1111:2222:', False, ''), + ('1111:', False, ''), + (':', False, ''), + (':8888', False, ''), + (':7777:8888', False, ''), + (':6666:7777:8888', False, ''), + (':5555:6666:7777:8888', False, ''), + (':4444:5555:6666:7777:8888', False, ''), + (':3333:4444:5555:6666:7777:8888', False, ''), + (':2222:3333:4444:5555:6666:7777:8888', False, ''), + (':1111:2222:3333:4444:5555:6666:7777:8888', False, '')]}, + {'heading': ':::', + 'notes': '', + 'tests': [(':::2222:3333:4444:5555:6666:7777:8888', False, ''), + ('1111:::3333:4444:5555:6666:7777:8888', False, ''), + ('1111:2222:::4444:5555:6666:7777:8888', False, ''), + ('1111:2222:3333:::5555:6666:7777:8888', False, ''), + ('1111:2222:3333:4444:::6666:7777:8888', False, ''), + ('1111:2222:3333:4444:5555:::7777:8888', False, ''), + ('1111:2222:3333:4444:5555:6666:::8888', False, ''), + ('1111:2222:3333:4444:5555:6666:7777:::', False, '')]}, + {'heading': 'Double ::");', + 'notes': '', + 'tests': [('::2222::4444:5555:6666:7777:8888', False, ''), + ('::2222:3333::5555:6666:7777:8888', False, ''), + ('::2222:3333:4444::6666:7777:8888', False, ''), + ('::2222:3333:4444:5555::7777:8888', False, ''), + ('::2222:3333:4444:5555:7777::8888', False, ''), + ('::2222:3333:4444:5555:7777:8888::', False, ''), + ('1111::3333::5555:6666:7777:8888', False, ''), + ('1111::3333:4444::6666:7777:8888', False, ''), + ('1111::3333:4444:5555::7777:8888', False, ''), + ('1111::3333:4444:5555:6666::8888', False, ''), + ('1111::3333:4444:5555:6666:7777::', False, ''), + ('1111:2222::4444::6666:7777:8888', False, ''), + ('1111:2222::4444:5555::7777:8888', False, ''), + ('1111:2222::4444:5555:6666::8888', False, ''), + ('1111:2222::4444:5555:6666:7777::', False, ''), + ('1111:2222:3333::5555::7777:8888', False, ''), + ('1111:2222:3333::5555:6666::8888', False, ''), + ('1111:2222:3333::5555:6666:7777::', False, ''), + ('1111:2222:3333:4444::6666::8888', False, ''), + ('1111:2222:3333:4444::6666:7777::', False, ''), + ('1111:2222:3333:4444:5555::7777::', False, '')]}, + {'heading': 'Too many components"', + 'notes': '', + 'tests': [('1111:2222:3333:4444:5555:6666:7777:8888:1.2.3.4', False, ''), + ('1111:2222:3333:4444:5555:6666:7777:1.2.3.4', False, ''), + ('1111:2222:3333:4444:5555:6666::1.2.3.4', False, ''), + ('::2222:3333:4444:5555:6666:7777:1.2.3.4', False, ''), + ('1111:2222:3333:4444:5555:6666:1.2.3.4.5', False, '')]}, + {'heading': 'Too few components', + 'notes': '', + 'tests': [('1111:2222:3333:4444:5555:1.2.3.4', False, ''), + ('1111:2222:3333:4444:1.2.3.4', False, ''), + ('1111:2222:3333:1.2.3.4', False, ''), + ('1111:2222:1.2.3.4', False, ''), + ('1111:1.2.3.4', False, ''), + ('1.2.3.4', False, '')]}, + {'heading': 'Missing :', + 'notes': '', + 'tests': [('11112222:3333:4444:5555:6666:1.2.3.4', False, ''), + ('1111:22223333:4444:5555:6666:1.2.3.4', False, ''), + ('1111:2222:33334444:5555:6666:1.2.3.4', False, ''), + ('1111:2222:3333:44445555:6666:1.2.3.4', False, ''), + ('1111:2222:3333:4444:55556666:1.2.3.4', False, ''), + ('1111:2222:3333:4444:5555:66661.2.3.4', False, '')]}, + {'heading': 'Missing .', + 'notes': '', + 'tests': [('1111:2222:3333:4444:5555:6666:255255.255.255', False, ''), + ('1111:2222:3333:4444:5555:6666:255.255255.255', False, ''), + ('1111:2222:3333:4444:5555:6666:255.255.255255', False, '')]}, + {'heading': 'Missing : intended for ::', + 'notes': '', + 'tests': [(':1.2.3.4', False, ''), + (':6666:1.2.3.4', False, ''), + (':5555:6666:1.2.3.4', False, ''), + (':4444:5555:6666:1.2.3.4', False, ''), + (':3333:4444:5555:6666:1.2.3.4', False, ''), + (':2222:3333:4444:5555:6666:1.2.3.4', False, ''), + (':1111:2222:3333:4444:5555:6666:1.2.3.4', False, '')]}, + {'heading': ':::', + 'notes': '', + 'tests': [(':::2222:3333:4444:5555:6666:1.2.3.4', False, ''), + ('1111:::3333:4444:5555:6666:1.2.3.4', False, ''), + ('1111:2222:::4444:5555:6666:1.2.3.4', False, ''), + ('1111:2222:3333:::5555:6666:1.2.3.4', False, ''), + ('1111:2222:3333:4444:::6666:1.2.3.4', False, ''), + ('1111:2222:3333:4444:5555:::1.2.3.4', False, '')]}, + {'heading': 'Double ::', + 'notes': '', + 'tests': [('::2222::4444:5555:6666:1.2.3.4', False, ''), + ('::2222:3333::5555:6666:1.2.3.4', False, ''), + ('::2222:3333:4444::6666:1.2.3.4', False, ''), + ('::2222:3333:4444:5555::1.2.3.4', False, ''), + ('1111::3333::5555:6666:1.2.3.4', False, ''), + ('1111::3333:4444::6666:1.2.3.4', False, ''), + ('1111::3333:4444:5555::1.2.3.4', False, ''), + ('1111:2222::4444::6666:1.2.3.4', False, ''), + ('1111:2222::4444:5555::1.2.3.4', False, ''), + ('1111:2222:3333::5555::1.2.3.4', False, '')]}, + {'heading': 'Missing parts', + 'notes': '', + 'tests': [('::.', False, ''), + ('::..', False, ''), + ('::...', False, ''), + ('::1...', False, ''), + ('::1.2..', False, ''), + ('::1.2.3.', False, ''), + ('::.2..', False, ''), + ('::.2.3.', False, ''), + ('::.2.3.4', False, ''), + ('::..3.', False, ''), + ('::..3.4', False, ''), + ('::...4', False, '')]}, + {'heading': 'Extra : in front', + 'notes': '', + 'tests': [(':1111:2222:3333:4444:5555:6666:7777::', False, ''), + (':1111:2222:3333:4444:5555:6666::', False, ''), + (':1111:2222:3333:4444:5555::', False, ''), + (':1111:2222:3333:4444::', False, ''), + (':1111:2222:3333::', False, ''), + (':1111:2222::', False, ''), + (':1111::', False, ''), + (':::', False, ''), + (':1111:2222:3333:4444:5555:6666::8888', False, ''), + (':1111:2222:3333:4444:5555::8888', False, ''), + (':1111:2222:3333:4444::8888', False, ''), + (':1111:2222:3333::8888', False, ''), + (':1111:2222::8888', False, ''), + (':1111::8888', False, ''), + (':::8888', False, ''), + (':1111:2222:3333:4444:5555::7777:8888', False, ''), + (':1111:2222:3333:4444::7777:8888', False, ''), + (':1111:2222:3333::7777:8888', False, ''), + (':1111:2222::7777:8888', False, ''), + (':1111::7777:8888', False, ''), + (':::7777:8888', False, ''), + (':1111:2222:3333:4444::6666:7777:8888', False, ''), + (':1111:2222:3333::6666:7777:8888', False, ''), + (':1111:2222::6666:7777:8888', False, ''), + (':1111::6666:7777:8888', False, ''), + (':::6666:7777:8888', False, ''), + (':1111:2222:3333::5555:6666:7777:8888', False, ''), + (':1111:2222::5555:6666:7777:8888', False, ''), + (':1111::5555:6666:7777:8888', False, ''), + (':::5555:6666:7777:8888', False, ''), + (':1111:2222::4444:5555:6666:7777:8888', False, ''), + (':1111::4444:5555:6666:7777:8888', False, ''), + (':::4444:5555:6666:7777:8888', False, ''), + (':1111::3333:4444:5555:6666:7777:8888', False, ''), + (':::3333:4444:5555:6666:7777:8888', False, ''), + (':::2222:3333:4444:5555:6666:7777:8888', False, ''), + (':1111:2222:3333:4444:5555:6666:1.2.3.4', False, ''), + (':1111:2222:3333:4444:5555::1.2.3.4', False, ''), + (':1111:2222:3333:4444::1.2.3.4', False, ''), + (':1111:2222:3333::1.2.3.4', False, ''), + (':1111:2222::1.2.3.4', False, ''), + (':1111::1.2.3.4', False, ''), + (':::1.2.3.4', False, ''), + (':1111:2222:3333:4444::6666:1.2.3.4', False, ''), + (':1111:2222:3333::6666:1.2.3.4', False, ''), + (':1111:2222::6666:1.2.3.4', False, ''), + (':1111::6666:1.2.3.4', False, ''), + (':::6666:1.2.3.4', False, ''), + (':1111:2222:3333::5555:6666:1.2.3.4', False, ''), + (':1111:2222::5555:6666:1.2.3.4', False, ''), + (':1111::5555:6666:1.2.3.4', False, ''), + (':::5555:6666:1.2.3.4', False, ''), + (':1111:2222::4444:5555:6666:1.2.3.4', False, ''), + (':1111::4444:5555:6666:1.2.3.4', False, ''), + (':::4444:5555:6666:1.2.3.4', False, ''), + (':1111::3333:4444:5555:6666:1.2.3.4', False, ''), + (':::2222:3333:4444:5555:6666:1.2.3.4', False, '')]}, + {'heading': 'Extra : at end', + 'notes': '', + 'tests': [('1111:2222:3333:4444:5555:6666:7777:::', False, ''), + ('1111:2222:3333:4444:5555:6666:::', False, ''), + ('1111:2222:3333:4444:5555:::', False, ''), + ('1111:2222:3333:4444:::', False, ''), + ('1111:2222:3333:::', False, ''), + ('1111:2222:::', False, ''), + ('1111:::', False, ''), + (':::', False, ''), + ('1111:2222:3333:4444:5555:6666::8888:', False, ''), + ('1111:2222:3333:4444:5555::8888:', False, ''), + ('1111:2222:3333:4444::8888:', False, ''), + ('1111:2222:3333::8888:', False, ''), + ('1111:2222::8888:', False, ''), + ('1111::8888:', False, ''), + ('::8888:', False, ''), + ('1111:2222:3333:4444:5555::7777:8888:', False, ''), + ('1111:2222:3333:4444::7777:8888:', False, ''), + ('1111:2222:3333::7777:8888:', False, ''), + ('1111:2222::7777:8888:', False, ''), + ('1111::7777:8888:', False, ''), + ('::7777:8888:', False, ''), + ('1111:2222:3333:4444::6666:7777:8888:', False, ''), + ('1111:2222:3333::6666:7777:8888:', False, ''), + ('1111:2222::6666:7777:8888:', False, ''), + ('1111::6666:7777:8888:', False, ''), + ('::6666:7777:8888:', False, ''), + ('1111:2222:3333::5555:6666:7777:8888:', False, ''), + ('1111:2222::5555:6666:7777:8888:', False, ''), + ('1111::5555:6666:7777:8888:', False, ''), + ('::5555:6666:7777:8888:', False, ''), + ('1111:2222::4444:5555:6666:7777:8888:', False, ''), + ('1111::4444:5555:6666:7777:8888:', False, ''), + ('::4444:5555:6666:7777:8888:', False, ''), + ('1111::3333:4444:5555:6666:7777:8888:', False, ''), + ('::3333:4444:5555:6666:7777:8888:', False, ''), + ('::2222:3333:4444:5555:6666:7777:8888:', False, '')]}, + {'heading': 'Additional cases: http://crisp.tweakblogs.net/blog/2031/ipv6-validation-%28and-caveats%29.html', + 'notes': '', + 'tests': [('0:a:b:c:d:e:f::', True, ''), + ('::0:a:b:c:d:e:f', True, 'syntactically correct, but bad form (::0:... could be combined)'), + ('a:b:c:d:e:f:0::', True, ''), + ("':10.0.0.1", False, '')]}] diff --git a/hyperlink/test/test_parse.py b/hyperlink/test/test_parse.py index cd2e9c97..ee8a9a21 100644 --- a/hyperlink/test/test_parse.py +++ b/hyperlink/test/test_parse.py @@ -12,7 +12,7 @@ # invalid utf8 -class TestURL(HyperlinkTestCase): +class TestParse(HyperlinkTestCase): def test_parse(self): purl = parse(TOTAL_URL) assert isinstance(purl, DecodedURL) diff --git a/hyperlink/test/test_parse_host.py b/hyperlink/test/test_parse_host.py new file mode 100644 index 00000000..75f5ff07 --- /dev/null +++ b/hyperlink/test/test_parse_host.py @@ -0,0 +1,27 @@ + +import socket + +from hyperlink import _url_codecs + +from .common import HyperlinkTestCase +from .ipv6_test_cases import DW_IPv6_TEST_CASES + + +class TestParseHost(HyperlinkTestCase): + def test_parse_host_dw_ipv6(self): + for group in DW_IPv6_TEST_CASES: + for ip_text, is_valid, _ in group['tests']: + if is_valid: + family, host = _url_codecs.parse_host(ip_text) + assert family == socket.AF_INET6 + assert ip_text == host + continue + + with self.assertRaises(_url_codecs.URLParseError): + family, _ = _url_codecs.parse_host(ip_text) + # in cases where an error isn't raised, we + # check that we parsed something other than + # ipv6 and make the necessary correction + if family != socket.AF_INET6: + raise _url_codecs.URLParseError + return From c2cea602c9111af80d3ea37b7e00a56b218270fa Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sat, 13 Jan 2018 20:05:39 -0800 Subject: [PATCH 4/7] WIP: progress on moving away from inet_pton, but inet_aton is way too permissive. And our IPv4 regex isn't doing us any favors, either. Now to face the question of whether that which is valid by socket.create_connection is valid in a URL, or if we should just switch to a full-regex approach, no socket modules --- hyperlink/_url_codecs.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/hyperlink/_url_codecs.py b/hyperlink/_url_codecs.py index 5d95dc6f..e5e3a8f7 100644 --- a/hyperlink/_url_codecs.py +++ b/hyperlink/_url_codecs.py @@ -25,10 +25,11 @@ class URLParseError(ValueError): # The following is based on Ian Cordasco's rfc3986 package -# TODO: This pattern isn't perfect, so we double check with inet_pton +# TODO: This pattern isn't perfect, so we double check with inet_aton # below, this will have to change for windows IPv4_PATT = '([0-9]{1,3}\.){3}[0-9]{1,3}' -IPv4_RE = re.compile(IPv4_PATT) +IPv4_PART_RE = re.compile(IPv4_PATT) +IPv4_RE = re.compile('^' + IPv4_PATT + '\Z') # Hexadecimal characters used in each piece of an IPv6 address HEXDIG_PATT = '[0-9A-Fa-f]{1,4}' # Least-significant 32 bits of an IPv6 address @@ -105,19 +106,24 @@ def parse_host(host): ipv6_match = _IP_LITERAL_RE.match(host) if ipv6_match is None: raise URLParseError(u'invalid IPv6 host: %r' % host) - ipv4_match = IPv4_RE.search(host) + ipv4_match = IPv4_PART_RE.search(host) if ipv4_match: try: - socket.inet_pton(socket.AF_INET, ipv4_match.group(0)) + socket.inet_aton(ipv4_match.group(0)) except socket.error: # NB: socket.error _is_ OSError on Py3 raise URLParseError(u'invalid IPv6 host with IPv4: %r' % host) return socket.AF_INET6, host - try: - socket.inet_pton(socket.AF_INET, host) - except (socket.error, UnicodeEncodeError): - # inet_pton raises socket.error on py2, OSError on py3 - # UnicodeEncodeError is only reached on non-ASCII unicode hosts - family = None # not an IP - else: - family = socket.AF_INET + # This is necessary because inet_aton takes non-quad inputs see + # the man page for inet + family = None + ipv4_match = IPv4_RE.search(host) + if ipv4_match: + try: + socket.inet_aton(host) + except (socket.error, UnicodeEncodeError): + # inet_aton raises socket.error on py2, OSError on py3 + # UnicodeEncodeError is only reached on non-ASCII unicode hosts + pass # regular domain/host name, needs resolution + else: + family = socket.AF_INET return family, host From 087f2f9d4633283f0c8b8c30e6d0be822dc1cf57 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sat, 13 Jan 2018 20:19:41 -0800 Subject: [PATCH 5/7] and now all the tests are passing using the regex-only approach. need more negative tests now that we're not relying on the socket module, but this is something. --- hyperlink/_url_codecs.py | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/hyperlink/_url_codecs.py b/hyperlink/_url_codecs.py index e5e3a8f7..d0447c50 100644 --- a/hyperlink/_url_codecs.py +++ b/hyperlink/_url_codecs.py @@ -23,13 +23,15 @@ class URLParseError(ValueError): _ALL_DELIMS = _GEN_DELIMS | _SUB_DELIMS -# The following is based on Ian Cordasco's rfc3986 package -# TODO: This pattern isn't perfect, so we double check with inet_aton -# below, this will have to change for windows -IPv4_PATT = '([0-9]{1,3}\.){3}[0-9]{1,3}' + +IPv4_PATT = ("(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}" + "(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])") IPv4_PART_RE = re.compile(IPv4_PATT) IPv4_RE = re.compile('^' + IPv4_PATT + '\Z') + +# The following is based on Ian Cordasco's rfc3986 package + # Hexadecimal characters used in each piece of an IPv6 address HEXDIG_PATT = '[0-9A-Fa-f]{1,4}' # Least-significant 32 bits of an IPv6 address @@ -106,24 +108,13 @@ def parse_host(host): ipv6_match = _IP_LITERAL_RE.match(host) if ipv6_match is None: raise URLParseError(u'invalid IPv6 host: %r' % host) - ipv4_match = IPv4_PART_RE.search(host) - if ipv4_match: - try: - socket.inet_aton(ipv4_match.group(0)) - except socket.error: # NB: socket.error _is_ OSError on Py3 + if '.' in host: + ipv4_match = IPv4_PART_RE.search(host) + if not ipv4_match: raise URLParseError(u'invalid IPv6 host with IPv4: %r' % host) return socket.AF_INET6, host - # This is necessary because inet_aton takes non-quad inputs see - # the man page for inet family = None ipv4_match = IPv4_RE.search(host) if ipv4_match: - try: - socket.inet_aton(host) - except (socket.error, UnicodeEncodeError): - # inet_aton raises socket.error on py2, OSError on py3 - # UnicodeEncodeError is only reached on non-ASCII unicode hosts - pass # regular domain/host name, needs resolution - else: - family = socket.AF_INET + family = socket.AF_INET return family, host From 476b99f985e0b9d082cff59c1648e1635fff3cba Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sun, 14 Jan 2018 17:43:33 -0800 Subject: [PATCH 6/7] move all socket parsing/conditional inet_pton importing into tests package, total coverage now up to 98% --- hyperlink/_url.py | 36 ------------------------------- hyperlink/test/common.py | 33 ++++++++++++++++++++++++++++ hyperlink/test/test_parse_host.py | 7 ++++-- hyperlink/test/test_url.py | 4 ++-- 4 files changed, 40 insertions(+), 40 deletions(-) diff --git a/hyperlink/_url.py b/hyperlink/_url.py index 61cec552..7f3e6054 100644 --- a/hyperlink/_url.py +++ b/hyperlink/_url.py @@ -18,12 +18,7 @@ import re import sys import string -import socket from unicodedata import normalize -try: - from socket import inet_pton -except ImportError: - inet_pton = None # defined below try: from collections.abc import Mapping except ImportError: # Python 2 @@ -34,37 +29,6 @@ from ._url_codecs import parse_host, URLParseError -if inet_pton is None: - # based on https://gist.github.com/nnemkin/4966028 - # this code only applies on Windows Python 2.7 - import ctypes - - class _sockaddr(ctypes.Structure): - _fields_ = [("sa_family", ctypes.c_short), - ("__pad1", ctypes.c_ushort), - ("ipv4_addr", ctypes.c_byte * 4), - ("ipv6_addr", ctypes.c_byte * 16), - ("__pad2", ctypes.c_ulong)] - - WSAStringToAddressA = ctypes.windll.ws2_32.WSAStringToAddressA - WSAAddressToStringA = ctypes.windll.ws2_32.WSAAddressToStringA - - def inet_pton(address_family, ip_string): - addr = _sockaddr() - ip_string = ip_string.encode('ascii') - addr.sa_family = address_family - addr_size = ctypes.c_int(ctypes.sizeof(addr)) - - if WSAStringToAddressA(ip_string, address_family, None, ctypes.byref(addr), ctypes.byref(addr_size)) != 0: - raise socket.error(ctypes.FormatError()) - - if address_family == socket.AF_INET: - return ctypes.string_at(addr.ipv4_addr, 4) - if address_family == socket.AF_INET6: - return ctypes.string_at(addr.ipv6_addr, 16) - raise socket.error('unknown address family') - - PY2 = (sys.version_info[0] == 2) unicode = type(u'') try: diff --git a/hyperlink/test/common.py b/hyperlink/test/common.py index 28eba527..14c4d434 100644 --- a/hyperlink/test/common.py +++ b/hyperlink/test/common.py @@ -2,6 +2,39 @@ from unittest import TestCase +try: + from socket import inet_pton +except ImportError: # pragma: no cover + # based on https://gist.github.com/nnemkin/4966028 + # this code only applies on Windows Python 2.7 + import ctypes + + class _sockaddr(ctypes.Structure): + _fields_ = [("sa_family", ctypes.c_short), + ("__pad1", ctypes.c_ushort), + ("ipv4_addr", ctypes.c_byte * 4), + ("ipv6_addr", ctypes.c_byte * 16), + ("__pad2", ctypes.c_ulong)] + + WSAStringToAddressA = ctypes.windll.ws2_32.WSAStringToAddressA + WSAAddressToStringA = ctypes.windll.ws2_32.WSAAddressToStringA + + def inet_pton(address_family, ip_string): + addr = _sockaddr() + ip_string = ip_string.encode('ascii') + addr.sa_family = address_family + addr_size = ctypes.c_int(ctypes.sizeof(addr)) + + if WSAStringToAddressA(ip_string, address_family, None, ctypes.byref(addr), ctypes.byref(addr_size)) != 0: + raise socket.error(ctypes.FormatError()) + + if address_family == socket.AF_INET: + return ctypes.string_at(addr.ipv4_addr, 4) + if address_family == socket.AF_INET6: + return ctypes.string_at(addr.ipv6_addr, 16) + raise socket.error('unknown address family') + + class HyperlinkTestCase(TestCase): """This type mostly exists to provide a backwards-compatible diff --git a/hyperlink/test/test_parse_host.py b/hyperlink/test/test_parse_host.py index 75f5ff07..4aa1eee5 100644 --- a/hyperlink/test/test_parse_host.py +++ b/hyperlink/test/test_parse_host.py @@ -3,10 +3,9 @@ from hyperlink import _url_codecs -from .common import HyperlinkTestCase +from .common import HyperlinkTestCase, inet_pton from .ipv6_test_cases import DW_IPv6_TEST_CASES - class TestParseHost(HyperlinkTestCase): def test_parse_host_dw_ipv6(self): for group in DW_IPv6_TEST_CASES: @@ -15,6 +14,9 @@ def test_parse_host_dw_ipv6(self): family, host = _url_codecs.parse_host(ip_text) assert family == socket.AF_INET6 assert ip_text == host + + inet_pton(socket.AF_INET6, host) # should not raise, as it's valid + continue with self.assertRaises(_url_codecs.URLParseError): @@ -24,4 +26,5 @@ def test_parse_host_dw_ipv6(self): # ipv6 and make the necessary correction if family != socket.AF_INET6: raise _url_codecs.URLParseError + return diff --git a/hyperlink/test/test_url.py b/hyperlink/test/test_url.py index 1e777648..3606b8a9 100644 --- a/hyperlink/test/test_url.py +++ b/hyperlink/test/test_url.py @@ -8,11 +8,11 @@ import sys import socket -from .common import HyperlinkTestCase +from .common import HyperlinkTestCase, inet_pton from .. import URL, URLParseError # automatically import the py27 windows implementation when appropriate from .. import _url -from .._url import inet_pton, SCHEME_PORT_MAP, parse_host +from .._url import SCHEME_PORT_MAP PY2 = (sys.version_info[0] == 2) From cde5cd8dad7eba632b356ab4051ae45800f02054 Mon Sep 17 00:00:00 2001 From: Mahmoud Hashemi Date: Sun, 14 Jan 2018 17:51:47 -0800 Subject: [PATCH 7/7] test package common.py needs socket import for windows-only path --- hyperlink/test/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hyperlink/test/common.py b/hyperlink/test/common.py index 14c4d434..902e4bdb 100644 --- a/hyperlink/test/common.py +++ b/hyperlink/test/common.py @@ -2,6 +2,7 @@ from unittest import TestCase +import socket try: from socket import inet_pton except ImportError: # pragma: no cover