当前位置:网站首页>Scala104-Spark.sql的内置日期时间函数
Scala104-Spark.sql的内置日期时间函数
2022-08-04 18:29:00 【51CTO】
有些时候我们会直接用df.createOrReplaceTempView(temp)创建临时表,用sql去计算。sparkSQL有些语法和hql不一样,做个笔记。
- <scala.version>2.11.12</scala.version>
- <spark.version>2.4.3</spark.version>
val
builder
=
SparkSession
.
builder()
.
appName(
"learningScala")
.
config(
"spark.executor.heartbeatInterval",
"60s")
.
config(
"spark.network.timeout",
"120s")
.
config(
"spark.serializer",
"org.apache.spark.serializer.KryoSerializer")
.
config(
"spark.kryoserializer.buffer.max",
"512m")
.
config(
"spark.dynamicAllocation.enabled",
false)
.
config(
"spark.sql.inMemoryColumnarStorage.compressed",
true)
.
config(
"spark.sql.inMemoryColumnarStorage.batchSize",
10000)
.
config(
"spark.sql.broadcastTimeout",
600)
.
config(
"spark.sql.autoBroadcastJoinThreshold",
-
1)
.
config(
"spark.sql.crossJoin.enabled",
true)
.
master(
"local[*]")
val
spark
=
builder.
getOrCreate()
spark.
sparkContext.
setLogLevel(
"ERROR")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
builder: org.apache.spark.sql.SparkSession.Builder = [email protected]
spark: org.apache.spark.sql.SparkSession = [email protected]
- 1.
- 2.
var
df1
=
Seq(
(
1,
"2019-04-01 11:45:50",
11.15,
"2019-04-02 11:45:49"),
(
2,
"2019-05-02 11:56:50",
10.37,
"2019-05-02 11:56:51"),
(
3,
"2019-07-21 12:45:50",
12.11,
"2019-08-21 12:45:50"),
(
4,
"2019-08-01 12:40:50",
14.50,
"2020-08-03 12:40:50"),
(
5,
"2019-01-06 10:00:50",
16.39,
"2019-01-05 10:00:50")
).
toDF(
"id",
"startTimeStr",
"payamount",
"endTimeStr")
df1
=
df1.
withColumn(
"startTime",
$
"startTimeStr".
cast(
"Timestamp"))
.
withColumn(
"endTime",
$
"endTimeStr".
cast(
"Timestamp"))
df1.
printSchema
df1.
show()
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
root
|-- id: integer (nullable = false)
|-- startTimeStr: string (nullable = true)
|-- payamount: double (nullable = false)
|-- endTimeStr: string (nullable = true)
|-- startTime: timestamp (nullable = true)
|-- endTime: timestamp (nullable = true)
+---+-------------------+---------+-------------------+-------------------+-------------------+
| id| startTimeStr|payamount| endTimeStr| startTime| endTime|
+---+-------------------+---------+-------------------+-------------------+-------------------+
| 1|2019-04-01 11:45:50| 11.15|2019-04-02 11:45:49|2019-04-01 11:45:50|2019-04-02 11:45:49|
| 2|2019-05-02 11:56:50| 10.37|2019-05-02 11:56:51|2019-05-02 11:56:50|2019-05-02 11:56:51|
| 3|2019-07-21 12:45:50| 12.11|2019-08-21 12:45:50|2019-07-21 12:45:50|2019-08-21 12:45:50|
| 4|2019-08-01 12:40:50| 14.5|2020-08-03 12:40:50|2019-08-01 12:40:50|2020-08-03 12:40:50|
| 5|2019-01-06 10:00:50| 16.39|2019-01-05 10:00:50|2019-01-06 10:00:50|2019-01-05 10:00:50|
+---+-------------------+---------+-------------------+-------------------+-------------------+
df1: org.apache.spark.sql.DataFrame = [id: int, startTimeStr: string ... 4 more fields]
df1: org.apache.spark.sql.DataFrame = [id: int, startTimeStr: string ... 4 more fields]
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
timestamp转string
把timestamp转换成对应格式字符串
- date_format把timestamp转换成对应的字符串
- 字符串格式用"yyyyMMdd"表示
root
|-- yyyyMMdd: string (nullable = true)
|-- yyyy_MM_dd: string (nullable = true)
|-- yyyy: string (nullable = true)
+--------+----------+----+
|yyyyMMdd|yyyy_MM_dd|yyyy|
+--------+----------+----+
|20190401|2019-04-01|2019|
|20190502|2019-05-02|2019|
|20190721|2019-07-21|2019|
|20190801|2019-08-01|2019|
|20190106|2019-01-06|2019|
+--------+----------+----+
sql: String =
"
SELECT date_format(startTime,'yyyyMMdd') AS yyyyMMdd,
date_format(startTime,'yyyy-MM-dd') AS yyyy_MM_dd,
date_format(startTime,'yyyy') AS yyyy
FROM TEMP
"
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
timestamp转date
- to_date可以把timestamp转换成date类型
root
|-- startTime: timestamp (nullable = true)
|-- endTime: timestamp (nullable = true)
|-- startDate: date (nullable = true)
|-- endDate: date (nullable = true)
+-------------------+-------------------+----------+----------+
| startTime| endTime| startDate| endDate|
+-------------------+-------------------+----------+----------+
|2019-04-01 11:45:50|2019-04-02 11:45:49|2019-04-01|2019-04-02|
|2019-05-02 11:56:50|2019-05-02 11:56:51|2019-05-02|2019-05-02|
|2019-07-21 12:45:50|2019-08-21 12:45:50|2019-07-21|2019-08-21|
|2019-08-01 12:40:50|2020-08-03 12:40:50|2019-08-01|2020-08-03|
|2019-01-06 10:00:50|2019-01-05 10:00:50|2019-01-06|2019-01-05|
+-------------------+-------------------+----------+----------+
sql: String =
SELECT startTime,endTime,
to_date(startTime) AS startDate,
to_date(endTime) AS endDate
FROM TEMP
df2: org.apache.spark.sql.DataFrame = [startTime: timestamp, endTime: timestamp ... 2 more fields]
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
求时间差
- 天数差函数datediff可以应用在timestamp中,也可应用在date类型中,单位是自然天,而不是24小时
- 月份差函数months_between同样可以,月度的单位好像是不固定的,即31天or30天
df2.
createOrReplaceTempView(
"temp")
var
sql
=
"""
SELECT startTime,
endTime,
datediff(endTime,startTime) AS dayInterval1,
datediff(endDate,startDate) AS dayInterval2,
months_between(endTime,startTime) AS monthInterval1,
months_between(endDate,startDate) AS monthInterval2
FROM TEMP
"""
// spark.sql(sql).printSchema
spark.
sql(
sql).
show()
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
+-------------------+-------------------+------------+------------+--------------+--------------+
| startTime| endTime|dayInterval1|dayInterval2|monthInterval1|monthInterval2|
+-------------------+-------------------+------------+------------+--------------+--------------+
|2019-04-01 11:45:50|2019-04-02 11:45:49| 1| 1| 0.03225769| 0.03225806|
|2019-05-02 11:56:50|2019-05-02 11:56:51| 0| 0| 0.0| 0.0|
|2019-07-21 12:45:50|2019-08-21 12:45:50| 31| 31| 1.0| 1.0|
|2019-08-01 12:40:50|2020-08-03 12:40:50| 368| 368| 12.06451613| 12.06451613|
|2019-01-06 10:00:50|2019-01-05 10:00:50| -1| -1| -0.03225806| -0.03225806|
+-------------------+-------------------+------------+------------+--------------+--------------+
sql: String =
"
SELECT startTime,
endTime,
datediff(endTime,startTime) AS dayInterval1,
datediff(endDate,startDate) AS dayInterval2,
months_between(endTime,startTime) AS monthInterval1,
months_between(endDate,startDate) AS monthInterval2
FROM TEMP
"
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
Ref
2020-03-24 于南京市江宁区九龙湖
边栏推荐
- YOLOv7-Pose尝鲜,基于YOLOv7的关键点模型测评
- 敏捷开发项目管理的一些心得
- 数仓相关,总结
- leetcode/有效的回文串,含有不需要判断回文的字符
- Develop those things: How to obtain the traffic statistics of the monitoring site through the EasyCVR platform?
- 【注册荣耀开发者】赢【荣耀70】手机
- 火灾报警联网FC18中CAN光端机常见问题解答和使用指导
- Win10只读文件夹怎么删除
- After EasyCVR is locally connected to the national standard device to map the public network, the local device cannot play and cascade the solution
- 入选爱分析·银行数字化厂商全景报告,网易数帆助力金融数字化场景落地
猜你喜欢

袋鼠云思枢:数驹DTengine,助力企业构建高效的流批一体数据湖计算平台

EasyCVR calls the cloud recording API and returns an error and no recording file is generated. What is the reason?

关于使用腾讯云HiFlow场景连接器每天提醒签到打卡

Flask框架实现注册加密功能详解【Flask企业课学习】

八猴渲染器是什么?它能干什么?八猴软件的界面讲解

图解LeetCode——899. 有序队列(难度:困难)

如何进行自动化测试?

OpenInfra Days China 2022 | SelectDB to share with you the Apache Doris in Internet advertising business practices

当项目中自动格式化插件Prettier和ESLint冲突报错时如何解决

npm配置国内镜像(淘宝镜像)
随机推荐
MMDetection 使用示例:从入门到出门
How does EasyCVR call the double-speed playback of device recording through the interface?
How does the intelligent video surveillance platform EasyCVR use the interface to export iframe addresses in batches?
【STM32】入门(五):串口TTL、RS232、RS485
limux入门3—磁盘与分区管理
Google Earth Engine APP——一键在线查看全球1984-至今年的影像同时加载一个影像分析
The Industrial Metaverse Brings Changes to Industry
Route lazy loading
基于 eBPF 的 Kubernetes 可观测实践
Google Earth Engine APP - one-click online viewing of global images from 1984 to this year and loading an image analysis at the same time
开篇-开启全新的.NET现代应用开发体验
Iptables防火墙基础知识介绍
EasyCVR如何通过接口调用设备录像的倍速回放?
如何封装 svg
FE01_OneHot-Scala Application
敏捷开发项目管理的一些心得
Kubernetes入门到精通- Operator 模式入门
dotnet core 使用 CoreRT 将程序编译为 Native 程序
链表的经典入门LeetCode题目
"No title"