Skip to content

Correctly detect final path when extracting multiple tarballs #4922

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 34 additions & 22 deletions easybuild/tools/filetools.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,10 +505,19 @@ def extract_file(fn, dest, cmd=None, extra_options=None, overwrite=False, forced
if extra_options:
cmd = f"{cmd} {extra_options}"

previous_paths = set(_get_paths_purged(abs_dest))
run_shell_cmd(cmd, in_dry_run=forced, hidden=not trace)

# note: find_base_dir also changes into the base dir!
base_dir = find_base_dir()
new_paths = set(_get_paths_purged(abs_dest)) - previous_paths
if len(new_paths) == 1:
new_path = new_paths.pop()
if os.path.isdir(new_path):
# note: find_base_dir also changes into the base dir!
base_dir = find_base_dir(new_path)
else:
base_dir = abs_dest
else:
base_dir = abs_dest
_log.debug(f"Multiple new folder/files found after extraction: {new_paths}. Using {base_dir} as base dir.")

# if changing into obtained directory is not desired,
# change back to where we came from (unless that was a non-existing directory)
Expand Down Expand Up @@ -1412,36 +1421,39 @@ def is_sha256_checksum(value):
return res


def find_base_dir():
"""
Try to locate a possible new base directory
- this is typically a single subdir, e.g. from untarring a tarball
- when extracting multiple tarballs in the same directory,
expect only the first one to give the correct path
def _get_paths_purged(path):
"""Find all files and folders in the folder

Ignore hidden entries and e.g. log directories
"""
def get_local_dirs_purged():
# e.g. always purge the log directory
# and hidden directories
ignoredirs = ["easybuild"]
IGNORED_DIRS = ["easybuild"]

lst = os.listdir(get_cwd())
lst = [d for d in lst if not d.startswith('.') and d not in ignoredirs]
return lst
lst = os.listdir(path)
lst = [p for p in lst if not p.startswith('.') and p not in IGNORED_DIRS]
return lst

lst = get_local_dirs_purged()
new_dir = get_cwd()

def find_base_dir(path=None):
"""
Locate a possible new base directory from the current working directory or the specified one and change to it.

This is typically a single subdir, e.g. from untarring a tarball.
It recurses into subfolders as long as that subfolder is the only child (file or folder)
and returns the current(ly processed) folder if multiple or no childs exist in it.
"""
new_dir = get_cwd() if path is None else os.path.abspath(path)
lst = _get_paths_purged(new_dir)
while len(lst) == 1:
new_dir = os.path.join(get_cwd(), lst[0])
new_dir = os.path.join(new_dir, lst[0])
if not os.path.isdir(new_dir):
break

change_dir(new_dir)
lst = get_local_dirs_purged()
lst = _get_paths_purged(new_dir)

# make sure it's a directory, and not a (single) file that was in a tarball for example
while not os.path.isdir(new_dir):
new_dir = os.path.dirname(new_dir)

change_dir(new_dir)
_log.debug("Last dir list %s" % lst)
_log.debug("Possible new dir %s found" % new_dir)
return new_dir
Expand Down
6 changes: 4 additions & 2 deletions easybuild/tools/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,8 +430,10 @@ def download_repo(repo=GITHUB_EASYCONFIGS_REPO, branch=None, commit=None, accoun
else:
_log.debug("%s downloaded to %s, extracting now", base_name, path)

base_dir = extract_file(target_path, path, forced=True, change_into_dir=False, trace=False)
extracted_path = os.path.join(base_dir, extracted_dir_name)
extracted_path = extract_file(target_path, path, forced=True, change_into_dir=False, trace=False)
if extracted_path != expected_path:
raise EasyBuildError(f"Unexpected directory '{extracted_path} for extracted repo. Expected: {expected_path}",
exit_code=EasyBuildExit.FAIL_EXTRACT)

# check if extracted_path exists
if not os.path.isdir(extracted_path):
Expand Down
155 changes: 125 additions & 30 deletions test/framework/filetools.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import shutil
import stat
import sys
import tarfile
import tempfile
import textwrap
import time
Expand Down Expand Up @@ -166,11 +167,39 @@ def test_find_base_dir(self):

foodir = os.path.join(tmpdir, 'foo')
os.mkdir(foodir)
os.mkdir(os.path.join(tmpdir, '.bar'))
os.mkdir(os.path.join(tmpdir, 'easybuild'))

# No files
os.chdir(tmpdir)
self.assertTrue(os.path.samefile(foodir, ft.find_base_dir()))
# Uses specified path
os.chdir(self.test_prefix)
self.assertTrue(os.path.samefile(foodir, ft.find_base_dir(tmpdir)))

# Only ignored files/folders
os.mkdir(os.path.join(tmpdir, '.bar'))
os.mkdir(os.path.join(tmpdir, 'easybuild'))
self.assertTrue(os.path.samefile(foodir, ft.find_base_dir(tmpdir)))

# Subfolder
bardir = os.path.join(foodir, 'bar')
os.mkdir(bardir)
self.assertTrue(os.path.samefile(bardir, ft.find_base_dir(tmpdir)))
# With ignored folder in subfolder
os.mkdir(os.path.join(bardir, '.bar'))
self.assertTrue(os.path.samefile(bardir, ft.find_base_dir(tmpdir)))

# Test recursiveness
subdir = os.path.join(bardir, 'sub')
os.mkdir(subdir)
self.assertTrue(os.path.samefile(subdir, ft.find_base_dir(tmpdir)))
# Only file(s) in subfolder
ft.write_file(os.path.join(subdir, 'src.c'), 'code')
self.assertTrue(os.path.samefile(subdir, ft.find_base_dir(tmpdir)))
ft.write_file(os.path.join(subdir, 'src2.c'), 'code')
self.assertTrue(os.path.samefile(subdir, ft.find_base_dir(tmpdir)))

# Files and folders in subfolder
os.mkdir(os.path.join(bardir, 'subdir'))
self.assertTrue(os.path.samefile(bardir, ft.find_base_dir(tmpdir)))

def test_find_glob_pattern(self):
"""test find_glob_pattern function"""
Expand Down Expand Up @@ -1865,7 +1894,7 @@ def test_apply_patch(self):
for with_backup in (True, False):
update_build_option('backup_patched_files', with_backup)
self.assertTrue(ft.apply_patch(toy_patch, path))
src_file = os.path.join(path, 'toy-0.0', 'toy.source')
src_file = os.path.join(path, 'toy.source')
backup_file = src_file + '.orig'
patched = ft.read_file(src_file)
pattern = "I'm a toy, and very proud of it"
Expand All @@ -1882,7 +1911,7 @@ def test_apply_patch(self):
toy_patch_gz = os.path.join(testdir, 'sandbox', 'sources', 'toy', 'toy-0.0_gzip.patch.gz')
with self.mocked_stdout_stderr():
self.assertTrue(ft.apply_patch(toy_patch_gz, path))
patched_gz = ft.read_file(os.path.join(path, 'toy-0.0', 'toy.source'))
patched_gz = ft.read_file(os.path.join(path, 'toy.source'))
pattern = "I'm a toy, and very very proud of it"
self.assertIn(pattern, patched_gz)

Expand All @@ -1893,7 +1922,7 @@ def test_apply_patch(self):
with self.mocked_stdout_stderr():
ft.apply_patch(toy_patch_gz, path, options=' --reverse')
# Change was really removed
self.assertNotIn(pattern, ft.read_file(os.path.join(path, 'toy-0.0', 'toy.source')))
self.assertNotIn(pattern, ft.read_file(os.path.join(path, 'toy.source')))

# test copying of files, both to an existing directory and a non-existing location
test_file = os.path.join(self.test_prefix, 'foo.txt')
Expand Down Expand Up @@ -2471,31 +2500,46 @@ def test_change_dir(self):
foo = os.path.join(self.test_prefix, 'foo')
self.assertErrorRegex(EasyBuildError, "Failed to change from .* to %s" % foo, ft.change_dir, foo)

def create_new_tarball(self, folder, filename=None):
"""Create new tarball with contents of folder and return path"""
if filename is None:
tarball = tempfile.mktemp(suffix='.tar.gz')
else:
tarball = os.path.join(tempfile.mkdtemp(), filename)
with tarfile.open(tarball, "w:gz") as tar:
for name in glob.glob(os.path.join(folder, '*')):
tar.add(name, arcname=os.path.basename(name))
return tarball

def test_extract_file(self):
"""Test extract_file"""
cwd = os.getcwd()

testdir = os.path.dirname(os.path.abspath(__file__))
toy_tarball = os.path.join(testdir, 'sandbox', 'sources', 'toy', 'toy-0.0.tar.gz')
test_src = tempfile.mkdtemp()
ft.mkdir(os.path.join(test_src, 'toy-0.0'))
ft.write_file(os.path.join(test_src, 'toy-0.0', 'toy.source'), 'content')
toy_tarball = self.create_new_tarball(test_src, filename='toy-0.0.tar.gz')

self.assertNotExists(os.path.join(self.test_prefix, 'toy-0.0', 'toy.source'))
extraction_path = os.path.join(self.test_prefix, 'extraction') # New directory
toy_path = os.path.join(extraction_path, 'toy-0.0')
self.assertNotExists(os.path.join(toy_path, 'toy.source'))
with self.mocked_stdout_stderr():
path = ft.extract_file(toy_tarball, self.test_prefix, change_into_dir=False)
self.assertExists(os.path.join(self.test_prefix, 'toy-0.0', 'toy.source'))
self.assertTrue(os.path.samefile(path, self.test_prefix))
path = ft.extract_file(toy_tarball, extraction_path, change_into_dir=False)
self.assertExists(os.path.join(toy_path, 'toy.source'))
self.assertTrue(os.path.samefile(path, toy_path))
# still in same directory as before if change_into_dir is set to False
self.assertTrue(os.path.samefile(os.getcwd(), cwd))
shutil.rmtree(os.path.join(path, 'toy-0.0'))
ft.remove_dir(toy_path)

toy_tarball_renamed = os.path.join(self.test_prefix, 'toy_tarball')
shutil.copyfile(toy_tarball, toy_tarball_renamed)

with self.mocked_stdout_stderr():
path = ft.extract_file(toy_tarball_renamed, self.test_prefix, cmd="tar xfvz %s", change_into_dir=False)
path = ft.extract_file(toy_tarball_renamed, extraction_path, cmd="tar xfvz %s", change_into_dir=False)
self.assertTrue(os.path.samefile(os.getcwd(), cwd))
self.assertExists(os.path.join(self.test_prefix, 'toy-0.0', 'toy.source'))
self.assertTrue(os.path.samefile(path, self.test_prefix))
shutil.rmtree(os.path.join(path, 'toy-0.0'))
self.assertExists(os.path.join(toy_path, 'toy.source'))
self.assertTrue(os.path.samefile(path, toy_path))
ft.remove_dir(toy_path)

# also test behaviour of extract_file under --dry-run
build_options = {
Expand All @@ -2505,47 +2549,98 @@ def test_extract_file(self):
init_config(build_options=build_options)

self.mock_stdout(True)
path = ft.extract_file(toy_tarball, self.test_prefix, change_into_dir=False)
path = ft.extract_file(toy_tarball, extraction_path, change_into_dir=False)
txt = self.get_stdout()
self.mock_stdout(False)
self.assertTrue(os.path.samefile(os.getcwd(), cwd))

self.assertTrue(os.path.samefile(path, self.test_prefix))
self.assertNotExists(os.path.join(self.test_prefix, 'toy-0.0'))
self.assertTrue(os.path.samefile(path, extraction_path))
self.assertNotExists(toy_path)
self.assertTrue(re.search('running shell command "tar xzf .*/toy-0.0.tar.gz"', txt))

with self.mocked_stdout_stderr():
path = ft.extract_file(toy_tarball, self.test_prefix, forced=True, change_into_dir=False)
self.assertExists(os.path.join(self.test_prefix, 'toy-0.0', 'toy.source'))
self.assertTrue(os.path.samefile(path, self.test_prefix))
path = ft.extract_file(toy_tarball, extraction_path, forced=True, change_into_dir=False)
self.assertExists(os.path.join(toy_path, 'toy.source'))
self.assertTrue(os.path.samefile(path, toy_path))
self.assertTrue(os.path.samefile(os.getcwd(), cwd))

build_options['extended_dry_run'] = False
init_config(build_options=build_options)

ft.remove_dir(os.path.join(self.test_prefix, 'toy-0.0'))
ft.remove_dir(toy_path)

ft.change_dir(cwd)
self.assertFalse(os.path.samefile(os.getcwd(), self.test_prefix))
self.assertFalse(os.path.samefile(os.getcwd(), extraction_path))

with self.mocked_stdout_stderr():
path = ft.extract_file(toy_tarball, self.test_prefix, change_into_dir=True)
path = ft.extract_file(toy_tarball, extraction_path, change_into_dir=True)
stdout = self.get_stdout()
stderr = self.get_stderr()

self.assertTrue(os.path.samefile(path, self.test_prefix))
self.assertTrue(os.path.samefile(os.getcwd(), self.test_prefix))
self.assertTrue(os.path.samefile(path, toy_path))
self.assertTrue(os.path.samefile(os.getcwd(), toy_path))
self.assertFalse(stderr)
self.assertTrue("running shell command" in stdout)

# check whether disabling trace output works
with self.mocked_stdout_stderr():
path = ft.extract_file(toy_tarball, self.test_prefix, change_into_dir=True, trace=False)
path = ft.extract_file(toy_tarball, extraction_path, change_into_dir=True, trace=False)
stdout = self.get_stdout()
stderr = self.get_stderr()
self.assertFalse(stderr)
self.assertFalse(stdout)

# Test tarball with multiple folders
test_src = tempfile.mkdtemp()
ft.mkdir(os.path.join(test_src, 'multi-1.0'))
ft.write_file(os.path.join(test_src, 'multi-1.0', 'src.c'), 'content')
ft.mkdir(os.path.join(test_src, 'multi-bonus'))
ft.write_file(os.path.join(test_src, 'multi-bonus', 'src.c'), 'content')
test_tarball = self.create_new_tarball(test_src)
# Start fresh
ft.remove_dir(extraction_path)
ft.change_dir(cwd)
with self.mocked_stdout_stderr():
path = ft.extract_file(test_tarball, extraction_path, change_into_dir=True)
self.assertTrue(os.path.samefile(path, extraction_path))
self.assertTrue(os.path.samefile(os.getcwd(), extraction_path)) # NOT a subfolder
self.assertExists(os.path.join(extraction_path, 'multi-1.0'))
self.assertExists(os.path.join(extraction_path, 'multi-bonus'))

# Extract multiple files with single folder to same folder, and file only
test_src = tempfile.mkdtemp()
ft.mkdir(os.path.join(test_src, 'bar-0.0'))
ft.write_file(os.path.join(test_src, 'bar-0.0', 'bar.source'), 'content')
bar_tarball = self.create_new_tarball(test_src)

test_src = tempfile.mkdtemp()
ft.write_file(os.path.join(test_src, 'main.source'), 'content')
file_tarball = self.create_new_tarball(test_src)

ft.remove_dir(extraction_path)
ft.change_dir(cwd)
with self.mocked_stdout_stderr():
path = ft.extract_file(toy_tarball, extraction_path, change_into_dir=False)
self.assertTrue(os.path.samefile(path, toy_path))
path = ft.extract_file(bar_tarball, extraction_path, change_into_dir=False)
self.assertTrue(os.path.samefile(path, os.path.join(extraction_path, 'bar-0.0')))
# Contains no folder
path = ft.extract_file(file_tarball, extraction_path, change_into_dir=False)
self.assertTrue(os.path.samefile(path, extraction_path))

# Folder and file
test_src = tempfile.mkdtemp()
ft.mkdir(os.path.join(test_src, 'multi-1.0'))
ft.write_file(os.path.join(test_src, 'multi-1.0', 'src.c'), 'content')
ft.write_file(os.path.join(test_src, 'main.c'), 'content')
test_tarball = self.create_new_tarball(test_src)
# When there is only a file or a file next to the folder the parent dir is returned
for tarball in (file_tarball, test_tarball):
ft.remove_dir(extraction_path)
with self.mocked_stdout_stderr():
path = ft.extract_file(tarball, extraction_path, change_into_dir=False)
self.assertTrue(os.path.samefile(path, extraction_path))

def test_remove(self):
"""Test remove_file, remove_dir and join remove functions."""
testfile = os.path.join(self.test_prefix, 'foo')
Expand Down Expand Up @@ -2828,7 +2923,7 @@ def test_search_file(self):
# to avoid accidental matches in other files already present (log files, etc.)
ec_dir = tempfile.mkdtemp()
test_ec = os.path.join(ec_dir, 'netCDF-C++-4.2-foss-2019a.eb')
ft.write_file(test_ec, ''),
ft.write_file(test_ec, '')
for pattern in ['netCDF-C++', 'CDF', 'C++', '^netCDF']:
var_defs, hits = ft.search_file([ec_dir], pattern, terse=True, filename_only=True)
self.assertEqual(var_defs, [], msg='For pattern ' + pattern)
Expand Down