spark-submit
spark-submit --master yarn --deploy-mode cluster --queue q1 --num-executors 1 script.py
pyspark
def process(rows):
    """Base64-encode the ``url`` of every row and concatenate the results.

    Intended as a ``mapPartitions`` callback: it consumes an iterable of
    rows and returns a one-element list, so each partition contributes a
    single output record.

    Args:
        rows: Iterable of objects exposing a ``url`` attribute holding
            either text or bytes.

    Returns:
        A list containing one text string: the base64 encodings of all
        URLs joined back to back, with no separator between them. An
        empty input yields ``[""]``.
    """
    encoded = []
    for row in rows:
        url = row.url
        # b64encode only accepts bytes; encode text explicitly as UTF-8 so
        # non-ASCII URLs work and the behavior is the same on Python 2 and 3.
        if not isinstance(url, bytes):
            url = url.encode("utf-8")
        # Base64 output is pure ASCII, so decoding to text is always safe.
        encoded.append(b64encode(url).decode("ascii"))
    # Join once at the end instead of repeated string concatenation.
    return ["".join(encoded)]
# Driver: run a Hive query and save the per-partition output of process()
# as text files.
conf = SparkConf().setAppName('PoliceHive2Xml')
spark_context = SparkContext(conf=conf)
hive_context = HiveContext(spark_context)

# Fetch at most 1000 rows where dayno=20170807.
sql = "select * from table where dayno=20170807 limit 1000"
data_frame = hive_context.sql(sql)

# NOTE(review): table_name and zip_file_name are not defined in this
# snippet; presumably they are set elsewhere - confirm before running.
hdfs_filepath = get_hdfs_filepath(table_name, zip_file_name)

# Go through the DataFrame's underlying RDD: DataFrame.mapPartitions was
# removed from PySpark in Spark 2.0, while .rdd.mapPartitions works on both
# Spark 1.x and 2.x.
data_frame.rdd.mapPartitions(process).saveAsTextFile(hdfs_filepath)