-
Notifications
You must be signed in to change notification settings - Fork 103
Glob #161
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Glob #161
Changes from all commits
9153012
1904c50
48d2d2b
eb6f97d
b008c43
109e3be
00b2a03
e0966a2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
import fnmatch | ||
import re | ||
import posixpath | ||
alex-bo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
def glob(client, hdfs_path):
    """Collect every HDFS path matching a shell-style pattern into a list.

    This is the eager counterpart of :func:`iglob`: the iterator it returns
    is fully consumed and the matches are handed back as a list.

    Wildcards follow the conventions of :mod:`fnmatch`, with one difference:
    names starting with a dot are only matched by patterns that themselves
    start with a dot.

    :param client: Instance of :class:`Client`.
    :param hdfs_path: HDFS path. May contain special characters like '*',
      '?' and '['.

    Sample usages:

    .. code-block:: python

      glob(client, './foo/bar/*')
      glob(client, './foo/bar/file[0-9].txt')
      glob(client, './foo/bar/file?.txt')

    """
    matches = iglob(client, hdfs_path)
    return list(matches)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
|
||
def iglob(client, hdfs_path):
    """Return an iterator which yields the paths matching a pathname pattern.

    The pattern may contain simple shell-style wildcards a la
    fnmatch. However, unlike fnmatch, filenames starting with a
    dot are special cases that are not matched by '*' and '?'
    patterns.

    A path that does not exist simply produces no match; this includes
    literal paths ending with a slash.

    :param client: Instance of :class:`Client`.
    :param hdfs_path: HDFS path. May contain special characters like '*',
      '?' and '['.

    Sample usages:

    .. code-block:: python

      iglob(client, './foo/bar/*')
      iglob(client, './foo/bar/file[0-9].txt')
      iglob(client, './foo/bar/file?.txt')

    """
    dirname, basename = posixpath.split(hdfs_path)
    if not _has_magic(hdfs_path):
        # Literal path: one status lookup decides whether it matches.
        if basename:
            if client.status(hdfs_path, strict=False):
                yield hdfs_path
        else:
            # Patterns ending with a slash should match only directories.
            # Use `strict=False` (as in the branch above) so that a missing
            # path yields nothing instead of raising.
            status = client.status(dirname, strict=False)
            if status and status['type'] == 'DIRECTORY':
                yield hdfs_path
        return
    if not dirname:
        # Bare pattern without a directory part: match inside the directory
        # that `_glob1` resolves for a falsy dirname.
        for p in _glob1(client, None, basename):
            yield p
        return
    # Expand the directory part first when it contains wildcards itself.
    # The `dirname != hdfs_path` guard prevents recursing forever on a path
    # whose split returns the path unchanged as its dirname.
    if dirname != hdfs_path and _has_magic(dirname):
        dirs = iglob(client, dirname)
    else:
        dirs = [dirname]
    # Wildcard basenames need a directory listing; literal ones only need an
    # existence check.
    if _has_magic(basename):
        glob_in_dir = _glob1
    else:
        glob_in_dir = _glob0
    for parent in dirs:
        for name in glob_in_dir(client, parent, basename):
            yield posixpath.join(parent, name)
|
||
|
||
def _glob1(client, dirname, pattern):
    """Return the names inside `dirname` that match the wildcard `pattern`.

    :param client: Instance of :class:`Client`.
    :param dirname: Directory to list. When falsy, the path returned by
      `client.resolve('.')` is listed instead.
    :param pattern: fnmatch-style pattern applied to the entry names.

    Names starting with a dot are dropped unless `pattern` itself starts
    with a dot. A `dirname` that is missing or is not a directory yields an
    empty list.
    """
    if not dirname:
        if isinstance(pattern, bytes):
            # NOTE(review): `bytes(text)` without an encoding only works on
            # Python 2 -- confirm whether bytes patterns must be supported
            # on Python 3.
            dirname = bytes(client.resolve('.'))
        else:
            dirname = client.resolve('.')
    # Listing something that is not an existing directory is an error on the
    # client side; such a path simply has no matching children, so check
    # first and bail out with an empty result.
    status = client.status(dirname, strict=False)
    if not status or status['type'] != 'DIRECTORY':
        return []
    names = client.list(dirname)
    if not _ishidden(pattern):
        names = [name for name in names if not _ishidden(name)]
    return fnmatch.filter(names, pattern)
|
||
|
||
def _glob0(client, dirname, basename): | ||
if not basename: | ||
# `os.path.split()` returns an empty basename for paths ending with a | ||
# directory separator. 'q*x/' should match only directories. | ||
if client.status(dirname)['type'] == 'DIRECTORY': | ||
return [basename] | ||
else: | ||
if client.status(posixpath.join(dirname, basename), strict=False): | ||
return [basename] | ||
return [] | ||
|
||
|
||
magic_check = re.compile('([*?[])') | ||
magic_check_bytes = re.compile(b'([*?[])') | ||
|
||
|
||
def _has_magic(s): | ||
if isinstance(s, bytes): | ||
match = magic_check_bytes.search(s) | ||
else: | ||
match = magic_check.search(s) | ||
return match is not None | ||
|
||
|
||
def _ishidden(path): | ||
return path[0] in ('.', b'.'[0]) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
import posixpath | ||
|
||
from nose.tools import eq_ | ||
|
||
from hdfs.glob import glob | ||
from util import _IntegrationTest | ||
|
||
|
||
class TestGlob(_IntegrationTest):
    """Integration tests running `glob` against a small fixture tree."""

    def setup(self):
        super(TestGlob, self).setup()
        self.__build_dirs()

    def __build_dirs(self):
        """Write the fixture files used by the glob test.

        Structure:

          dir_1
            dir_1_1
              file_1_3_1.txt
            dir_1_2
              file_1_3_1.txt
            dir_1_3
              file_1_3_1.txt
              file_1_3_2.txt
              file_1_3_3.txt
            file_1_1.txt
          dir_2
            dir_2_1
              file_2_3_1.txt
            dir_2_2
              file_2_2_1.txt
            dir_2_3
              file_2_3_1.txt
              file_2_3_2.txt
              file_2_3_3.txt
            file_2_1.txt
        """
        # (path components, file contents), written in this exact order.
        fixtures = [
            (('dir_1', 'dir_1_1', 'file_1_3_1.txt'), b'file_1_3_1'),
            (('dir_1', 'dir_1_2', 'file_1_3_1.txt'), b'file_1_3_1'),
            (('dir_1', 'dir_1_3', 'file_1_3_1.txt'), b'file_1_3_1'),
            (('dir_1', 'dir_1_3', 'file_1_3_2.txt'), b'file_1_3_2'),
            (('dir_1', 'dir_1_3', 'file_1_3_3.txt'), b'file_1_3_3'),
            (('dir_1', 'file_1_1.txt'), b'file_1_1'),
            (('dir_2', 'dir_2_1', 'file_2_3_1.txt'), b'file_2_3_1'),
            (('dir_2', 'dir_2_2', 'file_2_2_1.txt'), b'file_2_2_1'),
            (('dir_2', 'dir_2_3', 'file_2_3_1.txt'), b'file_2_3_1'),
            (('dir_2', 'dir_2_3', 'file_2_3_2.txt'), b'file_2_3_2'),
            (('dir_2', 'dir_2_3', 'file_2_3_3.txt'), b'file_2_3_3'),
            (('dir_2', 'file_2_1.txt'), b'file_2_1'),
        ]
        for parts, contents in fixtures:
            self._write(posixpath.join(*parts), contents)

    def test(self):
        # Each case pairs a pattern with the exact, ordered list of matches
        # that `glob` is expected to return for it.
        cases = [
            ('./dir_1/dir_1_3/*', [
                './dir_1/dir_1_3/file_1_3_1.txt',
                './dir_1/dir_1_3/file_1_3_2.txt',
                './dir_1/dir_1_3/file_1_3_3.txt',
            ]),
            ('./dir_2/dir_2_3/file_2_3_?.txt', [
                './dir_2/dir_2_3/file_2_3_1.txt',
                './dir_2/dir_2_3/file_2_3_2.txt',
                './dir_2/dir_2_3/file_2_3_3.txt',
            ]),
            ('*/*.txt', [
                'dir_1/file_1_1.txt',
                'dir_2/file_2_1.txt',
            ]),
            ('./dir_[1-2]/file_[1-2]_1.txt', [
                './dir_1/file_1_1.txt',
                './dir_2/file_2_1.txt',
            ]),
            ('./dir_*/dir_*/file_[1-2]_3_2.txt', [
                './dir_1/dir_1_3/file_1_3_2.txt',
                './dir_2/dir_2_3/file_2_3_2.txt',
            ]),
            ('./dir_[3-4]/file_[1-2]_1.txt', []),
            ('./dir_*/dir_*/file_[3-4]_3_2.txt', []),
        ]
        for pattern, expected in cases:
            actual = glob(self.client, pattern)
            eq_(expected, actual, 'Unexpected result for pattern ' + pattern)
|
||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.