archive.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. #
  2. # Copyright 2018-2022 Elyra Authors
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import fnmatch
  17. import os
  18. import tarfile
  19. import tempfile
  # Characters that mark a filename as an fnmatch-style wildcard pattern.
  # The first entry ("*") doubles as the "include everything" marker.
  WILDCARDS = [
      "*",
      "?",
      "[",
  ]
  def create_project_temp_dir():
      """
      Return the Elyra scratch directory, creating it when it does not exist yet.

      The directory is the ``elyra`` sub-folder of the system temporary directory
      reported by ``tempfile.gettempdir()``.

      :return: full path of the ``elyra`` temporary directory
      :raises FileExistsError: if the path already exists but is not a directory
      """
      project_temp_dir = os.path.join(tempfile.gettempdir(), "elyra")
      # exist_ok=True makes creation race-free: a separate "exists?" check followed by
      # mkdir can fail when two processes reach this point at the same time.
      os.makedirs(project_temp_dir, exist_ok=True)
      return project_temp_dir
  def directory_in_list(directory, filenames):
      """
      Report whether any entry of filenames refers to the given directory.

      An entry qualifies when it begins with the directory name followed by the
      path separator, or when the directory name matches the entry interpreted
      as an fnmatch pattern.
      """
      for entry in filenames:
          is_child_path = entry.startswith(directory + os.sep)
          if is_child_path or fnmatch.fnmatch(directory, entry):
              return True
      return False
  def has_wildcards(filename):
      """
      Tell whether filename contains at least one of the WILDCARDS characters.

      See https://docs.python.org/3/library/fnmatch.html for the pattern syntax.
      """
      # The filename has wildcards exactly when it shares a character with WILDCARDS.
      return not set(WILDCARDS).isdisjoint(filename)
  def directory_prefixed(filename):
      """
      Tell whether filename is prefixed by a directory (i.e., lives in a sub-directory).

      True only when the path separator occurs in filename and filename neither
      starts nor ends with that separator.
      """
      if os.sep not in filename:
          return False
      if filename.startswith(os.sep):
          return False
      return not filename.endswith(os.sep)
  def create_temp_archive(archive_name, source_dir, filenames=None, recursive=False, require_complete=False):
      """
      Create a gzip-compressed tar archive holding the specified list of files.

      The archive is written into the directory returned by create_project_temp_dir().

      :param archive_name: the name of the archive to be created
      :param source_dir: the root folder containing source files
      :param filenames: the filenames to include, each of which can contain wildcards and/or
          specify subdirectories; the first entry is treated as the source file and the
          remaining entries as its dependencies
      :param recursive: flag to include sub directories recursively
      :param require_complete: flag to indicate an exception should be raised if all filenames
          are not included
      :return: full path of the created archive
      :raises FileNotFoundError: when require_complete is set and some filenames matched nothing;
          the exception argument is the set of unmatched filenames
      """
      # Materialise the input once so that any iterable (not only a sliceable sequence)
      # can be used both for set membership and for the dependency slice below.
      requested = list(filenames or [])
      # Since filenames is essentially static, convert to set immediately and use the set
      filenames_set = set(requested)
      # If there's a '*' - less things to check.
      include_all = WILDCARDS[0] in filenames_set
      # Exact (non-wildcard) filenames that already matched and need no further checks.
      processed_filenames = set()
      # Every requested filename/pattern that matched at least one archive member.
      matched_set = set()

      def tar_filter(tarinfo):
          """Filter files from the generated archive; returning None excludes the entry."""
          if tarinfo.isdir():
              # ignore hidden directories (e.g. ipynb checkpoints and/or trash contents)
              if any(part.startswith(".") for part in tarinfo.name.split("/")):
                  return None
              # always return the base directory (empty string) otherwise tar will be empty
              if not tarinfo.name:
                  return tarinfo
              # only include subdirectories if enabled in common properties
              if recursive:
                  return tarinfo
              # We have a directory, check if any filenames start with this value and
              # allow if found - except if a single '*' is listed (i.e., include_all) in
              # which case we don't want to add this directory since recursive is False.
              # This occurs with filenames like `data/util.py` or `data/*.py`.
              if not include_all and directory_in_list(tarinfo.name, filenames_set):
                  return tarinfo
              return None
          # We have a file at this point...
          # Special case for single wildcard entries ('*')
          if include_all:
              return tarinfo
          for filename in filenames_set:
              if not filename or filename in processed_filenames:  # Skip processing
                  continue
              # Match filename against candidate filename - handling wildcards
              if fnmatch.fnmatch(tarinfo.name, filename):
                  # if this is a direct match, record that it has been processed
                  if not has_wildcards(filename) and not recursive:
                      processed_filenames.add(filename)
                  matched_set.add(filename)
                  return tarinfo
              # If the filename is a "flat" wildcarded value (i.e., isn't prefixed with a directory name)
              # then we should take the basename of the candidate file to perform the match against. This
              # occurs for dependencies like *.py when include-subdirectories is enabled.
              if not directory_prefixed(filename) and has_wildcards(filename):
                  if fnmatch.fnmatch(os.path.basename(tarinfo.name), filename):
                      matched_set.add(filename)
                      return tarinfo
          return None

      archive = os.path.join(create_project_temp_dir(), archive_name)
      with tarfile.open(archive, "w:gz", dereference=True) as tar:
          tar.add(source_dir, arcname="", filter=tar_filter)
      if require_complete and not include_all:
          # Get the list of dependencies by discarding the first item of filenames,
          # which is always the source file.
          dependencies_set = set(requested[1:])
          # Supported script file extensions.
          wildcard_expression_list = [f"{WILDCARDS[0]}.py", f"{WILDCARDS[0]}.r"]
          wildcard_expression = (
              len(dependencies_set) == 1 and next(iter(dependencies_set)) in wildcard_expression_list
          )
          # Compare matched_set against filenames_set to ensure they're the same.
          # Tolerate no matching files when a single filename is a wildcard_expression.
          if len(filenames_set) > len(matched_set) and not wildcard_expression:
              raise FileNotFoundError(filenames_set - matched_set)  # Only include the missing filenames
      return archive