Spark: calculate the average value of the same key in different partitions (entry level - simple implementation)
2022-07-27 03:24:00 · One's cow
Compute the average value of each key across different partitions, implemented two ways: with aggregateByKey and with combineByKey.
aggregateByKey
import org.apache.spark.{SparkConf, SparkContext}

object RDD_Operator_Transform_aggregateByKey {
  def main(args: Array[String]): Unit = {
    // TODO Create the environment
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("Operator")
    val sc = new SparkContext(sparkConf)

    // TODO RDD operator - key-value - aggregateByKey
    // Compute the average value of each key across different partitions
    val rdd = sc.makeRDD(List(
      ("A", 1), ("B", 2), ("C", 3), ("D", 4), ("A", 5), ("B", 6), ("C", 7), ("A", 8)
    ), 2)

    // Zero value (0, 0) is the per-key accumulator: (running sum, running count)
    val newRDD = rdd.aggregateByKey((0, 0))(
      // Intra-partition rule: add the value to the sum, bump the count
      (t, v) => (t._1 + v, t._2 + 1),
      // Inter-partition rule: merge the (sum, count) accumulators
      (t1, t2) => (t1._1 + t2._1, t1._2 + t2._2)
    )

    // toDouble avoids integer division truncating the average (e.g. 14 / 3 == 4)
    val avgRDD = newRDD.mapValues {
      case (sum, cnt) => sum.toDouble / cnt
    }
    avgRDD.collect().foreach(println)

    // TODO Shut down the environment
    sc.stop()
  }
}
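The two functions passed to aggregateByKey compose in two phases: the first runs independently inside each partition, the second merges the per-partition accumulators. The same logic can be traced in plain Scala, without Spark, using the same two-partition split as the example (helper names like `seqOp`/`combOp` are illustrative, not Spark API):

```scala
// seqOp folds one value into the per-key accumulator (runningSum, runningCount)
def seqOp(acc: (Int, Int), v: Int): (Int, Int) = (acc._1 + v, acc._2 + 1)
// combOp merges two accumulators coming from different partitions
def combOp(a: (Int, Int), b: (Int, Int)): (Int, Int) = (a._1 + b._1, a._2 + b._2)

// The example data, already split into its two partitions
val partitions = List(
  List(("A", 1), ("B", 2), ("C", 3), ("D", 4)),
  List(("A", 5), ("B", 6), ("C", 7), ("A", 8))
)

// Phase 1: seqOp runs independently inside each partition
val perPartition: List[(String, (Int, Int))] = partitions.flatMap { part =>
  part.groupBy(_._1).map { case (k, kvs) =>
    k -> kvs.map(_._2).foldLeft((0, 0))(seqOp)
  }
}

// Phase 2: combOp merges the per-partition accumulators for each key
val merged: Map[String, (Int, Int)] = perPartition.groupBy(_._1).map {
  case (k, kvs) => k -> kvs.map(_._2).reduce(combOp)
}

val averages = merged.map { case (k, (sum, cnt)) => k -> sum.toDouble / cnt }
// e.g. key "A" accumulates (1,1) in partition 0 and (13,2) in partition 1,
// merging to (14, 3), so averages("A") == 14.0 / 3
```

Tracing it by hand: partition 0 yields A:(1,1), B:(2,1), C:(3,1), D:(4,1); partition 1 yields A:(13,2), B:(6,1), C:(7,1); merging gives A:(14,3), B:(8,2), C:(10,2), D:(4,1).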
combineByKey
import org.apache.spark.{SparkConf, SparkContext}

object RDD_Operator_Transform_combineByKey {
  def main(args: Array[String]): Unit = {
    // TODO Create the environment
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("Operator")
    val sc = new SparkContext(sparkConf)

    // TODO RDD operator - key-value - combineByKey
    // Compute the average value of each key across different partitions
    val rdd = sc.makeRDD(List(
      ("A", 1), ("B", 2), ("C", 3), ("D", 4), ("A", 5), ("B", 6), ("C", 7), ("A", 8)
    ), 2)

    // First argument : transforms the first value of each key into the accumulator (value, 1)
    // Second argument: intra-partition rule - folds later values into the accumulator
    // Third argument : inter-partition rule - merges accumulators from different partitions
    val newRDD = rdd.combineByKey(
      v => (v, 1),
      (t: (Int, Int), v) => (t._1 + v, t._2 + 1),
      (t1: (Int, Int), t2: (Int, Int)) => (t1._1 + t2._1, t1._2 + t2._2)
    )

    // toDouble avoids integer division truncating the average
    val avgRDD = newRDD.mapValues {
      case (sum, cnt) => sum.toDouble / cnt
    }
    avgRDD.collect().foreach(println)

    // TODO Shut down the environment
    sc.stop()
  }
}
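What distinguishes combineByKey from aggregateByKey is the first function: instead of folding every value into a fixed zero value, the first value seen for a key in a partition is transformed into the initial accumulator. A plain-Scala sketch of the three functions (no Spark required; `combinePartition` is a hypothetical helper for illustration):

```scala
// createCombiner: turns the FIRST value seen for a key in a partition into (value, 1)
def createCombiner(v: Int): (Int, Int) = (v, 1)
// mergeValue: folds a later value from the same partition into the accumulator
def mergeValue(acc: (Int, Int), v: Int): (Int, Int) = (acc._1 + v, acc._2 + 1)
// mergeCombiners: merges accumulators produced in different partitions
def mergeCombiners(a: (Int, Int), b: (Int, Int)): (Int, Int) = (a._1 + b._1, a._2 + b._2)

// Process one partition: createCombiner on the first occurrence of a key,
// mergeValue on every occurrence after that
def combinePartition(part: List[(String, Int)]): Map[String, (Int, Int)] =
  part.foldLeft(Map.empty[String, (Int, Int)]) { case (acc, (k, v)) =>
    acc.get(k) match {
      case None      => acc + (k -> createCombiner(v))   // first value for this key
      case Some(old) => acc + (k -> mergeValue(old, v))  // subsequent value
    }
  }

val p0 = combinePartition(List(("A", 1), ("B", 2), ("C", 3), ("D", 4)))
val p1 = combinePartition(List(("A", 5), ("B", 6), ("C", 7), ("A", 8)))

// Merge the two partitions' results with mergeCombiners
val merged = (p0.toList ++ p1.toList).groupBy(_._1).map {
  case (k, kvs) => k -> kvs.map(_._2).reduce(mergeCombiners)
}
val averages = merged.map { case (k, (sum, cnt)) => k -> sum.toDouble / cnt }
```

This is why combineByKey needs no zero value, and why it fits cases where the accumulator type differs from the value type, as (Int, Int) differs from Int here.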