当前位置：网站首页>Usage of collect_list in Scala105-Spark.sql

Usage of collect_list in Scala105-Spark.sql

2022-08-04 18:33:00 【51CTO】

       
       import 
       
       org.
       
       apache.
       
       spark.
       
       sql.
       
       functions.
       
       _
       
       import 
       
       spark.
       
       implicits.
       
       _
       
       import 
       
       org.
       
       apache.
       
       spark.
       
       ml.
       
       feature.
       
       VectorAssembler
       
       import 
       
       org.
       
       apache.
       
       spark.
       
       ml.
       
       linalg.{
       
       Vector, 
       
       Vectors}
       
       import 
       
       org.
       
       apache.
       
       spark.
       
       sql.{
       
       DataFrame, 
       
       Row, 
       
       SparkSession}
      
1.
2.
3.
4.
5.

       
       import org.apache.spark.sql.functions._
       
import spark.implicits._
       
import org.apache.spark.ml.feature.VectorAssembler
       
import org.apache.spark.ml.linalg.{Vector, Vectors}
       
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
      
1.
2.
3.
4.
5.

       
       // Configure a local SparkSession builder; every setting is passed as a plain key/value entry.
val builder = SparkSession
  .builder()
  .appName("learningScala")
  // Executor heartbeat interval and network timeout.
  .config("spark.executor.heartbeatInterval", "60s")
  .config("spark.network.timeout", "120s")
  // Use Kryo serialization with a 512m maximum buffer.
  .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .config("spark.kryoserializer.buffer.max", "512m")
  .config("spark.dynamicAllocation.enabled", false)
  // In-memory columnar cache settings.
  .config("spark.sql.inMemoryColumnarStorage.compressed", true)
  .config("spark.sql.inMemoryColumnarStorage.batchSize", 10000)
  // Join-related settings; per the Spark SQL configuration docs a threshold of -1 turns off automatic broadcast joins.
  .config("spark.sql.broadcastTimeout", 600)
  .config("spark.sql.autoBroadcastJoinThreshold", -1)
  .config("spark.sql.crossJoin.enabled", true)
  // Run locally using all available cores.
  .master("local[*]")

// Reuse an existing session if there is one, otherwise create it from the builder above.
val spark = builder.getOrCreate()

// Only log messages at ERROR level.
spark.sparkContext.setLogLevel("ERROR")
      
1.
2.
3.
4.
5.
6.
7.
8.
9.
10.
11.
12.
13.
14.
15.
16.

       
       builder: org.apache.spark.sql.SparkSession.Builder = org.apache.spark.sql.SparkSession$Builder@<hash>
       
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@<hash>
      
1.
2.

       
       // Sample rows: (id, start time as text, pay amount, end time as text).
// Built as a single immutable value: the two timestamp columns are derived in the same chain,
// so no `var` reassignment is needed.
val df1 = Seq(
  (1, "2019-04-01 11:45:50", 11.15, "2019-04-02 11:45:49"),
  (2, "2019-05-02 11:56:50", 10.37, "2019-05-02 11:56:51"),
  (3, "2019-07-21 12:45:50", 12.11, "2019-08-21 12:45:50"),
  (2, "2019-08-01 12:40:50", 14.50, "2020-08-03 12:40:50"),
  (5, "2019-01-06 10:00:50", 16.39, "2019-01-05 10:00:50")
).toDF("id", "startTimeStr", "payamount", "endTimeStr")
  // Cast the text columns to timestamps. `$` must be directly attached to the string literal
  // for the column interpolator (from spark.implicits._) to apply.
  .withColumn("startTime", $"startTimeStr".cast("Timestamp"))
  .withColumn("endTime", $"endTimeStr".cast("Timestamp"))

// Print the schema and the rows for inspection.
df1.printSchema()
df1.show()
      
1.
2.
3.
4.
5.
6.
7.
8.
9.
10.
11.

       
       root
       
 |-- id: integer (nullable = false)
       
 |-- startTimeStr: string (nullable = true)
       
 |-- payamount: double (nullable = false)
       
 |-- endTimeStr: string (nullable = true)
       
 |-- startTime: timestamp (nullable = true)
       
 |-- endTime: timestamp (nullable = true)
       
+---+-------------------+---------+-------------------+-------------------+-------------------+
       
| id|       startTimeStr|payamount|         endTimeStr|          startTime|            endTime|
       
+---+-------------------+---------+-------------------+-------------------+-------------------+
       
|  1|2019-04-01 11:45:50|    11.15|2019-04-02 11:45:49|2019-04-01 11:45:50|2019-04-02 11:45:49|
       
|  2|2019-05-02 11:56:50|    10.37|2019-05-02 11:56:51|2019-05-02 11:56:50|2019-05-02 11:56:51|
       
|  3|2019-07-21 12:45:50|    12.11|2019-08-21 12:45:50|2019-07-21 12:45:50|2019-08-21 12:45:50|
       
|  2|2019-08-01 12:40:50|     14.5|2020-08-03 12:40:50|2019-08-01 12:40:50|2020-08-03 12:40:50|
       
|  5|2019-01-06 10:00:50|    16.39|2019-01-05 10:00:50|2019-01-06 10:00:50|2019-01-05 10:00:50|
       
+---+-------------------+---------+-------------------+-------------------+-------------------+
       
df1: org.apache.spark.sql.DataFrame = [id: int, startTimeStr: string ... 4 more fields]
       
df1: org.apache.spark.sql.DataFrame = [id: int, startTimeStr: string ... 4 more fields]
      
1.
2.
3.
4.
5.
6.
7.
8.
9.
10.
11.
12.
13.
14.
15.
16.
17.
18.
19.
20.
21.
22.
23.
24.
25.

       
       // Register df1 under the name "temp1" so it can be referenced from SQL text.
df1.createOrReplaceTempView("temp1")
      
1.

       
       // Window query over temp1: adds payamount_array, the payamount values collected within each
// id partition in startTimeStr order. The interpolator prefix must touch the opening quotes,
// and the literal's lines start at column 0 so no extra whitespace ends up in the query text.
val sql = s"""
SELECT *,
collect_list(payamount) over(partition BY id ORDER BY startTimeStr) payamount_array
FROM temp1
"""
      
1.
2.
3.
4.
5.

       
       sql: String =
       
"
       
SELECT *,
       
collect_list(payamount) over(partition BY id ORDER BY startTimeStr) payamount_array
       
FROM temp1
       
"
      
1.
2.
3.
4.
5.
6.

       
       // Run the window query against the registered temp view.
val dfCollect = spark.sql(sql)
      
1.

       
       dfCollect: org.apache.spark.sql.DataFrame = [id: int, startTimeStr: string ... 5 more fields]
      
1.

       
       // Display the result, including the payamount_array column.
dfCollect.show()
      
1.

       
       +---+-------------------+---------+-------------------+-------------------+-------------------+---------------+
       
| id|       startTimeStr|payamount|         endTimeStr|          startTime|            endTime|payamount_array|
       
+---+-------------------+---------+-------------------+-------------------+-------------------+---------------+
       
|  1|2019-04-01 11:45:50|    11.15|2019-04-02 11:45:49|2019-04-01 11:45:50|2019-04-02 11:45:49|        [11.15]|
       
|  3|2019-07-21 12:45:50|    12.11|2019-08-21 12:45:50|2019-07-21 12:45:50|2019-08-21 12:45:50|        [12.11]|
       
|  5|2019-01-06 10:00:50|    16.39|2019-01-05 10:00:50|2019-01-06 10:00:50|2019-01-05 10:00:50|        [16.39]|
       
|  2|2019-05-02 11:56:50|    10.37|2019-05-02 11:56:51|2019-05-02 11:56:50|2019-05-02 11:56:51|        [10.37]|
       
|  2|2019-08-01 12:40:50|     14.5|2020-08-03 12:40:50|2019-08-01 12:40:50|2020-08-03 12:40:50|  [10.37, 14.5]|
       
+---+-------------------+---------+-------------------+-------------------+-------------------+---------------+
      
1.
2.
3.
4.
5.
6.
7.
8.
9.

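SQL 的基本逻辑：按 id 分组（PARTITION BY id），组内按 startTimeStr 升序排序，再用 collect_list 把 payamount 收集成数组，数组内元素的顺序与 startTimeStr 的升序一致。注意：窗口中带有 ORDER BY 且未显式指定窗口帧时，默认帧是从分区首行到当前行，因此得到的是逐行累积的数组——例如 id=2 的两行分别为 [10.37] 和 [10.37, 14.5]。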

2020-05-28 于南京市江宁区九龙湖

原网站

版权声明
本文为[51CTO]所创，转载请带上原文链接，感谢
https://yzsam.com/2022/216/202208041827407218.html

当前位置：网站首页>Usage of collect_list in Scala105-Spark.sql

Usage of collect_list in Scala105-Spark.sql

边栏推荐

猜你喜欢

随机推荐