SimpleDao
spark 提交任务 python pyspark
2019-05-06, 访问数: 1116

spark-submit

  1. spark-submit --master yarn --deploy-mode cluster --queue q1 --num-executors 1 script.py

pyspark

  1. def process(rows):
  2. content = ""
  3. for row in rows:
  4. content += b64encode(row.url)
  5. return [content]
  6. conf = SparkConf().setAppName('PoliceHive2Xml')
  7. spark_context = SparkContext(conf=conf)
  8. hive_context = HiveContext(spark_context)
  9. sql = "select * from table where dayno=20170807 limit 1000"
  10. data_frame = hive_context.sql(sql)
  11. hdfs_filepath = get_hdfs_filepath(table_name, zip_file_name)
  12. data_frame.mapPartitions(process).saveAsTextFile(hdfs_filepath)