当前位置:网站首页>Scala104 - Built-in datetime functions for Spark.sql
Scala104 - Built-in datetime functions for Spark.sql
2022-08-04 18:32:00 【51CTO】
Sometimes we call df.createOrReplaceTempView("temp") directly to create a temporary view and then do the computation in SQL. Some Spark SQL syntax differs from HQL (Hive SQL), so these are my notes.
- <scala.version>2.11.12</scala.version>
- <spark.version>2.4.3</spark.version>
// Build a local SparkSession for the examples that follow.
// Settings applied below, in order:
//   - executor heartbeat interval and network timeout
//   - Kryo serializer with a 512m maximum buffer
//   - dynamic allocation switched off
//   - compressed in-memory columnar storage with a batch size of 10000
//   - broadcast timeout of 600, auto-broadcast join threshold of -1
//   - cross joins enabled
//   - master "local[*]"
val builder = SparkSession
  .builder()
  .appName("learningScala")
  .config("spark.executor.heartbeatInterval", "60s")
  .config("spark.network.timeout", "120s")
  .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .config("spark.kryoserializer.buffer.max", "512m")
  .config("spark.dynamicAllocation.enabled", false)
  .config("spark.sql.inMemoryColumnarStorage.compressed", true)
  .config("spark.sql.inMemoryColumnarStorage.batchSize", 10000)
  .config("spark.sql.broadcastTimeout", 600)
  .config("spark.sql.autoBroadcastJoinThreshold", -1)
  .config("spark.sql.crossJoin.enabled", true)
  .master("local[*]")

// Reuse an existing session if one is already running, otherwise create it.
val spark = builder.getOrCreate()

// Keep the console quiet: only ERROR-level log messages are printed.
spark.sparkContext.setLogLevel("ERROR")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
builder: org.apache.spark.sql.SparkSession.Builder = [email protected]
spark: org.apache.spark.sql.SparkSession = [email protected]
- 1.
- 2.
// Sample data: five rows of (id, start time as string, pay amount, end time as string).
// Row 5 deliberately has an end time earlier than its start time.
// NOTE(review): `toDF` on a Seq and the `$"..."` column syntax need
// `spark.implicits._` in scope; that import is not visible in this excerpt.
var df1 = Seq(
  (1, "2019-04-01 11:45:50", 11.15, "2019-04-02 11:45:49"),
  (2, "2019-05-02 11:56:50", 10.37, "2019-05-02 11:56:51"),
  (3, "2019-07-21 12:45:50", 12.11, "2019-08-21 12:45:50"),
  (4, "2019-08-01 12:40:50", 14.50, "2020-08-03 12:40:50"),
  (5, "2019-01-06 10:00:50", 16.39, "2019-01-05 10:00:50")
).toDF("id", "startTimeStr", "payamount", "endTimeStr")

// Add timestamp-typed copies of the two string columns.
df1 = df1
  .withColumn("startTime", $"startTimeStr".cast("Timestamp"))
  .withColumn("endTime", $"endTimeStr".cast("Timestamp"))

// Print the resulting schema and the rows.
df1.printSchema
df1.show()
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
root
|-- id: integer (nullable = false)
|-- startTimeStr: string (nullable = true)
|-- payamount: double (nullable = false)
|-- endTimeStr: string (nullable = true)
|-- startTime: timestamp (nullable = true)
|-- endTime: timestamp (nullable = true)
+---+-------------------+---------+-------------------+-------------------+-------------------+
| id| startTimeStr|payamount| endTimeStr| startTime| endTime|
+---+-------------------+---------+-------------------+-------------------+-------------------+
| 1|2019-04-01 11:45:50| 11.15|2019-04-02 11:45:49|2019-04-01 11:45:50|2019-04-02 11:45:49|
| 2|2019-05-02 11:56:50| 10.37|2019-05-02 11:56:51|2019-05-02 11:56:50|2019-05-02 11:56:51|
| 3|2019-07-21 12:45:50| 12.11|2019-08-21 12:45:50|2019-07-21 12:45:50|2019-08-21 12:45:50|
| 4|2019-08-01 12:40:50| 14.5|2020-08-03 12:40:50|2019-08-01 12:40:50|2020-08-03 12:40:50|
| 5|2019-01-06 10:00:50| 16.39|2019-01-05 10:00:50|2019-01-06 10:00:50|2019-01-05 10:00:50|
+---+-------------------+---------+-------------------+-------------------+-------------------+
df1: org.apache.spark.sql.DataFrame = [id: int, startTimeStr: string ... 4 more fields]
df1: org.apache.spark.sql.DataFrame = [id: int, startTimeStr: string ... 4 more fields]
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
Timestamp to string
Convert a timestamp to a string in the desired format
- date_format converts a timestamp to the corresponding string
- The output format is given as a pattern string such as "yyyyMMdd"
root
|-- yyyyMMdd: string (nullable = true)
|-- yyyy_MM_dd: string (nullable = true)
|-- yyyy: string (nullable = true)
+--------+----------+----+
|yyyyMMdd|yyyy_MM_dd|yyyy|
+--------+----------+----+
|20190401|2019-04-01|2019|
|20190502|2019-05-02|2019|
|20190721|2019-07-21|2019|
|20190801|2019-08-01|2019|
|20190106|2019-01-06|2019|
+--------+----------+----+
sql: String =
"
SELECT date_format(startTime,'yyyyMMdd') AS yyyyMMdd,
date_format(startTime,'yyyy-MM-dd') AS yyyy_MM_dd,
date_format(startTime,'yyyy') AS yyyy
FROM TEMP
"
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
Timestamp to date
- to_date converts a timestamp to the date type
root
|-- startTime: timestamp (nullable = true)
|-- endTime: timestamp (nullable = true)
|-- startDate: date (nullable = true)
|-- endDate: date (nullable = true)
+-------------------+-------------------+----------+----------+
| startTime| endTime| startDate| endDate|
+-------------------+-------------------+----------+----------+
|2019-04-01 11:45:50|2019-04-02 11:45:49|2019-04-01|2019-04-02|
|2019-05-02 11:56:50|2019-05-02 11:56:51|2019-05-02|2019-05-02|
|2019-07-21 12:45:50|2019-08-21 12:45:50|2019-07-21|2019-08-21|
|2019-08-01 12:40:50|2020-08-03 12:40:50|2019-08-01|2020-08-03|
|2019-01-06 10:00:50|2019-01-05 10:00:50|2019-01-06|2019-01-05|
+-------------------+-------------------+----------+----------+
sql: String =
SELECT startTime,endTime,
to_date(startTime) AS startDate,
to_date(endTime) AS endDate
FROM TEMP
df2: org.apache.spark.sql.DataFrame = [startTime: timestamp, endTime: timestamp ... 2 more fields]
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
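Computing time differences
- The day-difference function datediff can be applied to timestamp columns as well as to date columns. Its unit is calendar (natural) days, not 24-hour periods: in the first row below the two timestamps are one second short of 24 hours apart, yet datediff returns 1.
- The month-difference function months_between also works on both types. The month unit does not look fixed at first glance (31 days or 30 days?); according to the Spark documentation, when the two values fall on the same day of the month (or both on the last day of the month) the result is a whole number of months and the time of day is ignored, otherwise the fractional part is computed on the basis of 31 days per month (e.g. 1/31 ≈ 0.03225806 in the output below).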
// Register df2 (which carries both the timestamp and the date columns) as the
// temporary view "temp" so it can be queried with SQL.
df2.createOrReplaceTempView("temp")

// Compare day and month differences computed from timestamp columns
// (dayInterval1 / monthInterval1) with those computed from date columns
// (dayInterval2 / monthInterval2).
var sql = """
SELECT startTime,
endTime,
datediff(endTime,startTime) AS dayInterval1,
datediff(endDate,startDate) AS dayInterval2,
months_between(endTime,startTime) AS monthInterval1,
months_between(endDate,startDate) AS monthInterval2
FROM TEMP
"""

// spark.sql(sql).printSchema
spark.sql(sql).show()
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
+-------------------+-------------------+------------+------------+--------------+--------------+
| startTime| endTime|dayInterval1|dayInterval2|monthInterval1|monthInterval2|
+-------------------+-------------------+------------+------------+--------------+--------------+
|2019-04-01 11:45:50|2019-04-02 11:45:49| 1| 1| 0.03225769| 0.03225806|
|2019-05-02 11:56:50|2019-05-02 11:56:51| 0| 0| 0.0| 0.0|
|2019-07-21 12:45:50|2019-08-21 12:45:50| 31| 31| 1.0| 1.0|
|2019-08-01 12:40:50|2020-08-03 12:40:50| 368| 368| 12.06451613| 12.06451613|
|2019-01-06 10:00:50|2019-01-05 10:00:50| -1| -1| -0.03225806| -0.03225806|
+-------------------+-------------------+------------+------------+--------------+--------------+
sql: String =
"
SELECT startTime,
endTime,
datediff(endTime,startTime) AS dayInterval1,
datediff(endDate,startDate) AS dayInterval2,
months_between(endTime,startTime) AS monthInterval1,
months_between(endDate,startDate) AS monthInterval2
FROM TEMP
"
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
Ref
2020-03-24 于南京市江宁区九龙湖
边栏推荐
- 【web自动化测试】Playwright快速入门,5分钟上手
- 悦刻难回巅峰
- Speech Recognition Learning Resources
- Google AppSheet: 无需编程构建零代码应用
- 通俗易懂-二维数组只能省略行不能省略列-人话版本
- margin 塌陷和重合的理解
- 2022 May 1 Mathematical Modeling Question C Explanation
- Go language Go language, understand Go language file operation in one article
- (ECCV-2022)GaitEdge:超越普通的端到端步态识别,提高实用性
- 情绪的波动起伏
猜你喜欢
随机推荐
防火墙基础之防火墙做出口设备安全防护
The Industrial Metaverse Brings Changes to Industry
Iptables防火墙基础知识介绍
火灾报警联网FC18中CAN光端机常见问题解答和使用指导
链表的经典入门LeetCode题目
Investigation and Research Based on the Involution Behavior of College Students
PHP代码审计7—文件上传漏洞
Google Earth Engine APP——一键在线查看全球1984-至今年的影像同时加载一个影像分析
YOLOv7-Pose尝鲜,基于YOLOv7的关键点模型测评
OpenInfra Days China 2022 | SelectDB to share with you the Apache Doris in Internet advertising business practices
路由技术
BigDecimal 使用注意!!“别踩坑”
GBase8s存储过程
leetcode 14. 最长公共前缀
Speech Recognition Learning Resources
Go language Go language, understand Go language file operation in one article
leetcode/有效的回文串,含有不需要判断回文的字符
如何给MySQL添加自定义语法 ?
网页端IM即时通讯开发:短轮询、长轮询、SSE、WebSocket
curl命令的那些事









