spark_script_demo_1009.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. import json
  2. import sys
  3. from pyspark.sql.types import *
  4. from pyspark.ml.classification import LogisticRegression
  5. from pyspark.ml.feature import VectorAssembler
  6. from pyspark.ml import Pipeline
  7. from pyspark.sql.functions import udf, col
  8. from pyspark.sql import SparkSession, DataFrame
  9. # sys.argv[1] inputs (JSON object): {"input1_key": "input1_table", "input2_key": "input2_table", ...}
  10. # sys.argv[2] outputs (JSON array): ["result_table1", "result_table2", ...]
  11. def run(inputs: dict, outputs: list):
  12. spark = SparkSession.builder.config('hive.metastore.uris',
  13. 'thrift://192.168.199.27:9083').enableHiveSupport().getOrCreate()
  14. param_dict = preprocess(input_infos=inputs, ss=spark)
  15. rets = main_func(**param_dict)
  16. postprocess(rets=rets, outputs=outputs)
  17. def read_table(ss: SparkSession, tb_name: str) -> DataFrame:
  18. return ss.sql(f'select * from {tb_name}')
  19. def write_table(df: DataFrame, tb_name: str):
  20. df.write.mode("overwrite").saveAsTable(tb_name)
  21. def preprocess(input_infos: dict, ss: SparkSession) -> dict:
  22. return {k: read_table(ss=ss, tb_name=v) for k, v in input_infos.items()}
  23. def postprocess(rets, outputs):
  24. [write_table(df=df, tb_name=outputs[idx]) for idx, df in enumerate(rets)]
  25. def to_array(col):
  26. def to_array_(v):
  27. return v.toArray().tolist()
  28. return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)
  29. def main_func(train_df: DataFrame, test_df: DataFrame):
  30. feat_cols = ['feature1', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6', 'feature7', 'feature8',
  31. 'feature9']
  32. vector_assembler = VectorAssembler().setInputCols(feat_cols).setOutputCol("features")
  33. #### 训练 ####
  34. print("step 1")
  35. lr = LogisticRegression(regParam=0.01, maxIter=100) # regParam 正则项参数
  36. pipeline = Pipeline(stages=[vector_assembler, lr])
  37. model = pipeline.fit(train_df)
  38. # 打印参数
  39. print("\n-------------------------------------------------------------------------")
  40. print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
  41. print("-------------------------------------------------------------------------\n")
  42. #### 预测, 保存结果 ####
  43. print("step 2")
  44. labels_and_preds = model.transform(test_df).withColumn("probability_xj", to_array(col("probability"))[1]) \
  45. .select("uuid", "label", "prediction", "probability_xj")
  46. return [labels_and_preds]
  47. if __name__ == '__main__':
  48. inputs_str = sys.argv[1]
  49. outputs_str = sys.argv[2]
  50. run(inputs=json.loads(inputs_str), outputs=json.loads(outputs_str))