parser.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. #
  2. # Copyright 2018-2022 Elyra Authors
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import os
  17. import re
  18. from typing import Dict
  19. from typing import List
  20. from typing import TypeVar
  21. import nbformat
  22. from traitlets.config import LoggingConfigurable
# Setup forward reference for type hint on return from class factory method. See
# https://stackoverflow.com/questions/39205527/can-you-annotate-return-type-when-value-is-instance-of-cls/39205612#39205612
# The bound is given as a string because FileReader is defined below this line.
# NOTE(review): no definition visible in this section of the file references F - confirm it is still needed.
F = TypeVar("F", bound="FileReader")
  26. class FileReader(LoggingConfigurable):
  27. """
  28. Base class for parsing a file for resources according to operation type. Subclasses set
  29. their own parser member variable according to their implementation language.
  30. """
  31. def __init__(self, filepath: str, **kwargs):
  32. super().__init__(**kwargs)
  33. self._filepath = filepath
  34. @property
  35. def filepath(self):
  36. return self._filepath
  37. @property
  38. def language(self) -> str:
  39. file_extension = os.path.splitext(self._filepath)[-1]
  40. if file_extension == ".py":
  41. return "python"
  42. elif file_extension == ".r":
  43. return "r"
  44. else:
  45. return None
  46. def read_next_code_chunk(self) -> List[str]:
  47. """
  48. Implements a generator for lines of code in the specified filepath. Subclasses
  49. may override if explicit line-by-line parsing is not feasible, e.g. with Notebooks.
  50. """
  51. with open(self._filepath) as f:
  52. for line in f:
  53. yield [line.strip()]
  54. class NotebookReader(FileReader):
  55. def __init__(self, filepath: str, **kwargs):
  56. super().__init__(filepath, **kwargs)
  57. with open(self._filepath) as f:
  58. self._notebook = nbformat.read(f, as_version=4)
  59. self._language = None
  60. try:
  61. self._language = self._notebook["metadata"]["kernelspec"]["language"].lower()
  62. except KeyError:
  63. self.log.warning(f"No language metadata found in {self._filepath}")
  64. @property
  65. def language(self) -> str:
  66. return self._language
  67. def read_next_code_chunk(self) -> List[str]:
  68. for cell in self._notebook.cells:
  69. if cell.source and cell.cell_type == "code":
  70. yield cell.source.split("\n")
  71. class ScriptParser(object):
  72. """
  73. Base class for parsing individual lines of code. Subclasses implement a search_expressions()
  74. function that returns language-specific regexes to match against code lines.
  75. """
  76. _comment_char = "#"
  77. def _get_line_without_comments(self, line):
  78. if self._comment_char in line:
  79. index = line.find(self._comment_char)
  80. line = line[:index]
  81. return line.strip()
  82. def parse_environment_variables(self, line):
  83. # Parse a line fed from file and match each regex in regex dictionary
  84. line = self._get_line_without_comments(line)
  85. if not line:
  86. return []
  87. matches = []
  88. for key, value in self.search_expressions().items():
  89. for pattern in value:
  90. regex = re.compile(pattern)
  91. for match in regex.finditer(line):
  92. matches.append((key, match))
  93. return matches
  94. class PythonScriptParser(ScriptParser):
  95. def search_expressions(self) -> Dict[str, List]:
  96. # TODO: add more key:list-of-regex pairs to parse for additional resources
  97. regex_dict = dict()
  98. # First regex matches envvar assignments of form os.environ["name"] = value w or w/o value provided
  99. # Second regex matches envvar assignments that use os.getenv("name", "value") with ow w/o default provided
  100. # Third regex matches envvar assignments that use os.environ.get("name", "value") with or w/o default provided
  101. # Both name and value are captured if possible
  102. envs = [
  103. r"os\.environ\[[\"']([a-zA-Z_]+[A-Za-z0-9_]*)[\"']\](?:\s*=(?:\s*[\"'](.[^\"']*)?[\"'])?)*",
  104. r"os\.getenv\([\"']([a-zA-Z_]+[A-Za-z0-9_]*)[\"'](?:\s*\,\s*[\"'](.[^\"']*)?[\"'])?",
  105. r"os\.environ\.get\([\"']([a-zA-Z_]+[A-Za-z0-9_]*)[\"'](?:\s*\,(?:\s*[\"'](.[^\"']*)?[\"'])?)*",
  106. ]
  107. regex_dict["env_vars"] = envs
  108. return regex_dict
  109. class RScriptParser(ScriptParser):
  110. def search_expressions(self) -> Dict[str, List]:
  111. # TODO: add more key:list-of-regex pairs to parse for additional resources
  112. regex_dict = dict()
  113. # Tests for matches of the form Sys.setenv("key" = "value")
  114. envs = [
  115. r"Sys\.setenv\([\"']*([a-zA-Z_]+[A-Za-z0-9_]*)[\"']*\s*=\s*[\"']*(.[^\"']*)?[\"']*\)",
  116. r"Sys\.getenv\([\"']*([a-zA-Z_]+[A-Za-z0-9_]*)[\"']*\)(.)*",
  117. ]
  118. regex_dict["env_vars"] = envs
  119. return regex_dict
  120. class ContentParser(LoggingConfigurable):
  121. parsers = {"python": PythonScriptParser(), "r": RScriptParser()}
  122. def parse(self, filepath: str) -> dict:
  123. """Returns a model dictionary of all the regex matches for each key in the regex dictionary"""
  124. properties = {"env_vars": {}, "inputs": [], "outputs": []}
  125. reader = self._get_reader(filepath)
  126. parser = self._get_parser(reader.language)
  127. if not parser:
  128. return properties
  129. for chunk in reader.read_next_code_chunk():
  130. if chunk:
  131. for line in chunk:
  132. matches = parser.parse_environment_variables(line)
  133. for key, match in matches:
  134. if key == "env_vars":
  135. properties[key][match.group(1)] = match.group(2)
  136. else:
  137. properties[key].append(match.group(1))
  138. return properties
  139. def _validate_file(self, filepath: str):
  140. """
  141. Validate file exists and is file (e.g. not a directory)
  142. """
  143. if not os.path.exists(filepath):
  144. raise FileNotFoundError(f"No such file or directory: {filepath}")
  145. if not os.path.isfile(filepath):
  146. raise IsADirectoryError(f"Is a directory: {filepath}")
  147. def _get_reader(self, filepath: str):
  148. """
  149. Find the proper reader based on the file extension
  150. """
  151. file_extension = os.path.splitext(filepath)[-1]
  152. self._validate_file(filepath)
  153. if file_extension == ".ipynb":
  154. return NotebookReader(filepath)
  155. elif file_extension in [".py", ".r"]:
  156. return FileReader(filepath)
  157. else:
  158. raise ValueError(f"File type {file_extension} is not supported.")
  159. def _get_parser(self, language: str):
  160. """
  161. Find the proper parser based on content language
  162. """
  163. parser = None
  164. if language:
  165. parser = self.parsers.get(language)
  166. if not parser:
  167. self.log.warning(f"Content parser for {language} is not available.")
  168. return parser