@@ -3,6 +3,7 @@ from app.core.airflow.uri import spark_result_tb_name
from app.schemas import AirflowTask
from jinja2 import Environment, PackageLoader, select_autoescape
from app.common.minio import FileHandler
+from configs.settings import config


class TaskCompiler:
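Note for reviewers: every `config.get(...)` call introduced in this diff assumes that `configs.settings` exposes a ConfigParser-style `config` object backed by an ini file with `TASK_IMAGES` and `MINIO` sections. A minimal sketch of that assumed module follows; the file name and loading logic are hypothetical, while the option names and the commented values are taken from this diff.

# configs/settings.py -- hypothetical sketch, not necessarily the project's real module
from configparser import ConfigParser

config = ConfigParser()
config.read('configs/settings.ini')  # assumed location of the ini file

# The ini file would then need at least:
#
# [TASK_IMAGES]
# java   = SXKJ:32775/java:1.0
# python = SXKJ:32775/pod_python:1.1
# datax  = SXKJ:32775/pod_datax:0.9
# sparks = SXKJ:32775/jupyter:0.96
#
# [MINIO]
# k8s_url = http://minio.default:9000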
@@ -42,24 +43,24 @@ class TaskCompiler:
class JavaTaskCompiler(TaskCompiler):
    def __init__(self, item: AirflowTask):
        super(JavaTaskCompiler, self).__init__(item)
-        self.default_image = 'SXKJ:32775/java:1.0'
+        self.default_image = config.get('TASK_IMAGES', 'java')  # 'SXKJ:32775/java:1.0'
        self.default_cmd = "echo \"$SCRIPT\" > run.py && python run.py"
        self.task.cmd = self.task.cmd or self.default_cmd
        tar_name = self.task.file_urls[0].split('/')[-1].split('_')[-1]
-        self.task.cmd = f'curl {"http://minio.default:9000"}/{self.task.file_urls[0]} --output {tar_name} && {self.task.cmd}'
+        self.task.cmd = f'curl {config.get("MINIO", "k8s_url")}/{self.task.file_urls[0]} --output {tar_name} && {self.task.cmd}'


class PythonTaskCompiler(TaskCompiler):
    def __init__(self, item: AirflowTask):
        super(PythonTaskCompiler, self).__init__(item)
-        self.default_image = 'SXKJ:32775/pod_python:1.1'
+        self.default_image = config.get('TASK_IMAGES', 'python')  # 'SXKJ:32775/pod_python:1.1'
        self.default_cmd = "echo \"$SCRIPT\" > run.py && python run.py"


class DataXTaskCompiler(TaskCompiler):
    def __init__(self, item: AirflowTask):
        super(DataXTaskCompiler, self).__init__(item)
-        self.default_image = 'SXKJ:32775/pod_datax:0.9'
+        self.default_image = config.get('TASK_IMAGES', 'datax')  # 'SXKJ:32775/pod_datax:0.9'
        self.default_cmd = f"cd datax/bin && echo \"$SCRIPT\" > transform_datax.py &&cat transform_datax.py && " \
                           f"python3 transform_datax.py && cat config.json && $HOME/conda/envs/py27/b" \
                           f"in/python datax.py {self.task.cmd_parameters} config.json "
@@ -94,24 +95,25 @@ class DataXTaskCompiler(TaskCompiler):
class SparksTaskCompiler(TaskCompiler):
    def __init__(self, item: AirflowTask):
        super(SparksTaskCompiler, self).__init__(item)
-        self.default_image = 'SXKJ:32775/jupyter:0.96'
+        self.default_image = config.get('TASK_IMAGES', 'sparks')
        parameters = {"master": "yarn",
                      "deploy-mode": "cluster",
-                      "driver-memory": "2g",
+                      "driver-memory": "1g",
                      "driver-cores ": 1,
-                      "executor-memory": "2g",
-                      "executor-cores": 4,
+                      "executor-memory": "1g",
+                      "executor-cores": 1,
                      "num-executors": 1,
                      "archives": "/home/sxkj/bigdata/py37.zip#python3env"
                      }
-        spark_config = {'spark.default.parallelism': 2,
-                        "spark.executor.memoryOverhead": "4g",
-                        "spark.driver.memoryOverhead": "2g",
-                        "spark.yarn.maxAppAttempts": 3,
+        spark_config = {'spark.default.parallelism': 1,
+                        "spark.executor.memoryOverhead": "1g",
+                        "spark.driver.memoryOverhead": "1g",
+                        "spark.yarn.maxAppAttempts": 1,
                        "spark.yarn.submit.waitAppCompletion": "true",
                        "spark.pyspark.driver.python": "python3env/py37/bin/python",
                        "spark.yarn.appMasterEnv.PYSPARK_PYTHON": "python3env/py37/bin/python",
-                        "spark.pyspark.python": "python3env/py37/bin/python"
+                        "spark.pyspark.python": "python3env/py37/bin/python",
+                        "spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation": "true"
                        }
        param_str = ' '.join([f'--{k} {v}' for k, v in parameters.items()])
        param_str += ''.join([f' --conf {k}={v}' for k, v in spark_config.items()])
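For reference, the two join expressions turn the reduced defaults above into a spark-submit style argument string. A standalone sketch reproducing just those two lines (spark_config is truncated here; the unchanged `"driver-cores "` key keeps its stray trailing space, which yields a doubled space in the output):

# Standalone sketch: what param_str evaluates to with the values from this diff.
parameters = {"master": "yarn", "deploy-mode": "cluster", "driver-memory": "1g",
              "driver-cores ": 1, "executor-memory": "1g", "executor-cores": 1,
              "num-executors": 1, "archives": "/home/sxkj/bigdata/py37.zip#python3env"}
spark_config = {"spark.default.parallelism": 1, "spark.executor.memoryOverhead": "1g",
                "spark.driver.memoryOverhead": "1g", "spark.yarn.maxAppAttempts": 1}
param_str = ' '.join([f'--{k} {v}' for k, v in parameters.items()])
param_str += ''.join([f' --conf {k}={v}' for k, v in spark_config.items()])
print(param_str)
# --master yarn --deploy-mode cluster --driver-memory 1g --driver-cores  1 --executor-memory 1g
# --executor-cores 1 --num-executors 1 --archives /home/sxkj/bigdata/py37.zip#python3env
# --conf spark.default.parallelism=1 --conf spark.executor.memoryOverhead=1g ... (printed as one line)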
@@ -156,14 +158,14 @@ class SparksTaskCompiler(TaskCompiler):
                skip_nodes.append(info["id"])
                continue
            if info['op'] == 'sql':
-                inputs = {}
                template_file = 'sql_script_template.py.jinja2'
            elif info['op'] == 'pyspark':
-                inputs = {k: spark_result_tb_name(job_id=job_id, task_id=self.task.id, spark_node_id=v[0],
-                                                  out_pin=v[1], is_tmp=task_mode) for k, v in info['inputs'].items()}
                template_file = 'pyspark_script_template.py.jinja2'
            else:
                continue
+            inputs = {k: spark_result_tb_name(job_id=job_id, task_id=self.task.id, spark_node_id=v[0],
+                                              out_pin=v[1], is_tmp=task_mode) for k, v in
+                                              info.get('inputs', {}).items()}
            outputs = [spark_result_tb_name(job_id=job_id, task_id=self.task.id, spark_node_id=info['id'],
                                            out_pin=0, is_tmp=task_mode)]
            sub_node = {
@@ -175,7 +177,7 @@ class SparksTaskCompiler(TaskCompiler):
                        template_file=template_file),
                },
                'cmds': ['/bin/bash', '-c', self.cmd_str(name=f'spark_{self.task.id}_{info["id"]}')],
-                'image': "SXKJ:32775/jupyter:0.96",
+                'image': config.get('TASK_IMAGES', 'sparks')
            }
            sub_nodes.append(sub_node)
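Also worth noting: `inputs` is now built once after the op dispatch, so sql nodes get their upstream tables resolved too, and `info.get('inputs', {})` keeps nodes without an `inputs` key from raising `KeyError`. A small illustration with hypothetical node dicts; only the `{input_name: [upstream_node_id, out_pin]}` shape is taken from the comprehension above, the concrete ids and the `'datasource'` op are made up:

# Hypothetical DAG-node dicts; only the key names mirror the code above.
sql_node = {'id': 3, 'op': 'sql', 'inputs': {'t1': [1, 0], 't2': [2, 0]}}
source_node = {'id': 1, 'op': 'datasource'}  # carries no 'inputs' key at all

for info in (sql_node, source_node):
    # Before this patch, sql nodes got inputs = {} and the pyspark branch used
    # info['inputs'], which raises KeyError when the key is absent. The patched
    # code resolves inputs uniformly like this:
    inputs = info.get('inputs', {})
    print(info['id'], inputs)
# 3 {'t1': [1, 0], 't2': [2, 0]}
# 1 {}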