// Spark-shell demo of common RDD actions. Assumes `sc: SparkContext` is in scope (REPL).
val rdd = sc.parallelize(List(1, 2, 3, 3))

// collect: return all elements of the RDD to the driver.
rdd.collect()

// count: return the number of elements in the RDD.
rdd.count()

// countByValue: return how many times each element occurs in the RDD.
rdd.countByValue()

// take: return the first 2 elements of the RDD.
rdd.take(2)

// top: return the 2 largest elements of the RDD.
val x = rdd.top(2)

// takeOrdered: return the first 2 elements using the natural (ascending) ordering.
rdd.takeOrdered(2)

// A descending Ordering[Int] for takeOrdered.
// FIX: the original wrote `if (x < y) 1 else -1`, which violates the Ordering
// contract — compare(a, a) returned -1 instead of 0 and compare was not
// antisymmetric on equal values, so sorting-based operations could misbehave.
// Returning 0 on ties keeps the same descending order while honoring the contract.
object Ord extends Ordering[Int] {
  override def compare(x: Int, y: Int): Int =
    if (x < y) 1 else if (x > y) -1 else 0
}

val pa = sc.parallelize(Array(1, 2, 3, 4, 5, 6))
// With the descending Ordering this yields the three largest elements: Array(6, 5, 4).
pa.takeOrdered(3)(Ord)

// foreach(func): apply the given function to every element of the RDD.
// Runs on the executors, so the printed order is nondeterministic.
rdd.foreach(println)

/* REPL transcript (results):

scala> val rdd = sc.parallelize(List(1, 2, 3, 3))
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[37] at parallelize at <console>:24

scala> rdd.collect()
res18: Array[Int] = Array(1, 2, 3, 3)

scala> rdd.count()
res19: Long = 4

scala> rdd.countByValue()
res20: scala.collection.Map[Int,Long] = Map(1 -> 1, 2 -> 1, 3 -> 2)

scala> rdd.take(2)
res21: Array[Int] = Array(1, 2)

scala> val x = rdd.top(2)
x: Array[Int] = Array(3, 3)

scala> rdd.takeOrdered(2)
res22: Array[Int] = Array(1, 2)

scala> object Ord extends Ordering[Int] { ... }
defined object Ord

scala> val pa = sc.parallelize(Array(1, 2, 3, 4, 5, 6))
pa: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[43] at parallelize at <console>:24

scala> pa.takeOrdered(3)(Ord)
res23: Array[Int] = Array(6, 5, 4)

scala> rdd.foreach(println)
2
1
3
3
*/