当前位置:网站首页>Spark SQL空值Null,NaN判断和处理

Spark SQL空值Null,NaN判断和处理

2022-07-06 00:23:00 南风知我意丿

Spark SQL空值Null,NaN判断和处理

Null 和 NaN

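注意:下面两句描述的是 JavaScript 中 null 与 undefined 的语义,并不适用于 Scala/Spark。在 Spark SQL 中,null 表示值缺失或未知,任何类型的列都可能出现 null;NaN(Not a Number)是 Float/Double 类型的一个特殊取值,表示无意义的浮点运算结果(例如对负数开平方),它本身不是 null。
null表示无、不存在或无效的对象或地址引用。它在简单的数学运算中会转换为0,它是一个全局对象。null == false返回的值是false。
undefined是一个全局属性,原始值undefined。它告诉我们有些东西没有赋值,没有定义。undefined不能转换成任何数字,因此在数学计算中使用它,返回的是NaN。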

	val d: Double = math.sqrt(-1.0)
    println(d)
	
    val n: Boolean = math.sqrt(-1.0).isNaN
    println(n)

运行结果(原文此处为截图):第一个 println 输出 NaN,第二个 println 输出 true。

Spark SQL空值Null,NaN判断和处理

    val df: DataFrame = session.sql(
      s"""
         |select * from sparktuning.course_pay1
         |""".stripMargin)
         
 // 删除任意一列含有空值(null)或NaN的行
val resNull=data1.na.drop()    
 resNull.limit(10).show()
+-------+------+---+------------+--------+-------------+---------+----------+------+
|affairs|gender|age|yearsmarried|children|religiousness|education|occupation|rating|
+-------+------+---+------------+--------+-------------+---------+----------+------+
|      0|  male| 37|          10|      no|            3|       18|         7|     4|
|      0|  male| 57|          15|     yes|            2|       14|         4|     4|
|      0|female| 32|          15|     yes|            4|       16|         1|     2|
|      0|  male| 22|         1.5|      no|            4|       14|         4|     5|
|      0|  male| 37|          15|     yes|            2|       20|         7|     2|
|      0|  male| 27|           4|     yes|            4|       18|         6|     4|
|      0|  male| 47|          15|     yes|            5|       17|         6|     4|
|      0|female| 22|         1.5|      no|            2|       17|         5|     4|
|      0|female| 27|           4|      no|            4|       14|         5|     4|
|      0|female| 37|          15|     yes|            1|       17|         5|     5|
+-------+------+---+------------+--------+-------------+---------+----------+------+
    
//删除指定列(gender、yearsmarried)中含有空值或NaN的行
val res=data1.na.drop(Array("gender","yearsmarried"))
 
// 删除指定列中"非空且非NaN"的值少于minNonNulls个的行 --注意字段类型;这里只指定了2列,阈值写10会删除所有行,实际使用时阈值不应超过列数
data1.na.drop(10,Array("gender","yearsmarried"))
    
//用布尔值填充指定列的空值 --注意字段类型:只对Boolean类型的列生效,类型不匹配的列会被忽略
df.na.fill(false,Array("courseid")) 
   
//用字符串填充所有String类型列的空值(非String类型的列不受影响)
val res123=data1.na.fill("wangxiao123")
 res123.limit(10).show()
+-------+-----------+---+------------+--------+-------------+---------+----------+-----------+
|affairs|     gender|age|yearsmarried|children|religiousness|education|occupation|     rating|
+-------+-----------+---+------------+--------+-------------+---------+----------+-----------+
|      0|       male| 37|          10|      no|            3|       18|         7|          4|
|      0|wangxiao123| 27| wangxiao123|      no|            4|       14|         6|wangxiao123|
|      0|wangxiao123| 32| wangxiao123|     yes|            1|       12|         1|wangxiao123|
|      0|wangxiao123| 57| wangxiao123|     yes|            5|       18|         6|wangxiao123|
|      0|wangxiao123| 22| wangxiao123|      no|            2|       17|         6|wangxiao123|
|      0|wangxiao123| 32| wangxiao123|      no|            2|       17|         5|wangxiao123|
|      0|     female| 22| wangxiao123|      no|            2|       12|         1|wangxiao123|
|      0|       male| 57|          15|     yes|            2|       14|         4|          4|
|      0|     female| 32|          15|     yes|            4|       16|         1|          2|
|      0|       male| 22|         1.5|      no|            4|       14|         4|          5|
+-------+-----------+---+------------+--------+-------------+---------+----------+-----------+
    
//对指定列的空值进行填充 -- 多列同值
df1.na.fill(123456,cols = Array("courseid","pointlistid")).show(false)

+---------+--------+-------+-----+-----------+--------+----+
|chapterid|courseid|majorid|money|pointlistid|dt      |dn  |
+---------+--------+-------+-----+-----------+--------+----+
|4        |123456  |5      |100  |3          |20190722|webA|
|7        |123456  |7      |100  |1          |20190722|webA|
|8        |123456  |3      |     |8          |20190722|webA|
|5        |14      |3      |100  |123456     |20190722|webA|
|4        |15      |2      |100  |3          |20190722|webA|
|9        |123456  |8      |100  |7          |20190722|webA|
|7        |17      |7      |100  |123456     |20190722|webA|
|0        |18      |9      |     |7          |20190722|webA|
|5        |123456  |8      |100  |4          |20190722|webA|
|4        |20      |1      |100  |123456     |20190722|webA|
|4        |123456  |5      |100  |1          |20190722|webA|
|0        |22      |3      |100  |9          |20190722|webA|
|1        |123456  |8      |100  |0          |20190722|webA|
|4        |24      |0      |100  |5          |20190722|webA|
|9        |123456  |9      |100  |0          |20190722|webA|
+---------+--------+-------+-----+-----------+--------+----+

//对指定列的空值进行填充 -- 多列不同值
df1.na.fill(Map("courseid"->123456,"pointlistid"->654321)).show(false)
+---------+--------+-------+-----+-----------+--------+----+
|chapterid|courseid|majorid|money|pointlistid|dt      |dn  |
+---------+--------+-------+-----+-----------+--------+----+
|4        |123456  |5      |100  |3          |20190722|webA|
|7        |123456  |7      |100  |1          |20190722|webA|
|8        |123456  |3      |     |8          |20190722|webA|
|5        |14      |3      |100  |654321     |20190722|webA|
|4        |15      |2      |100  |3          |20190722|webA|
|9        |123456  |8      |100  |7          |20190722|webA|
|7        |17      |7      |100  |654321     |20190722|webA|
|0        |18      |9      |     |7          |20190722|webA|
|5        |123456  |8      |100  |4          |20190722|webA|
|4        |20      |1      |100  |654321     |20190722|webA|
|4        |123456  |5      |100  |1          |20190722|webA|
|0        |22      |3      |100  |9          |20190722|webA|
|1        |123456  |8      |100  |0          |20190722|webA|
|4        |24      |0      |100  |5          |20190722|webA|
|9        |123456  |9      |100  |0          |20190722|webA|
+---------+--------+-------+-----+-----------+--------+----+


//查询gender列为空值的行
data1.filter("gender is null").select("gender").limit(10).show
+------+
|gender|
+------+
|  null|
|  null|
|  null|
|  null|
|  null|
+------+
    
    
 data1.filter("gender is not null").select("gender").limit(10).show
+------+
|gender|
+------+
|  male|
|female|
|  male|
|female|
|  male|
|  male|
|  male|
|  male|
|female|
|female|
+------+
    
    
 data1.filter( data1("gender").isNull ).select("gender").limit(10).show
+------+
|gender|
+------+
|  null|
|  null|
|  null|
|  null|
|  null|
+------+
    
    
 data1.filter("gender<>''").select("gender").limit(10).show
+------+
|gender|
+------+
|  male|
|female|
|  male|
|female|
|  male|
|  male|
|  male|
|  male|
|female|
|female|
+------+
原网站

版权声明
本文为[南风知我意丿]所创,转载请带上原文链接,感谢
https://blog.csdn.net/Lzx116/article/details/125615901