Apache Spark hints

// Load a text file from HDFS; every element of the resulting RDD is one line of the file
scala> val data = sc.textFile("hdfs://path/to/file")

// NOTE: foreach runs on the executors, so on a cluster the output lands in the executors' stdout.
// To print on the driver use data.collect().foreach(println) or data.take(n).foreach(println).
scala> data.foreach(println) // print all lines from file

// Helper that prints a single line
scala> def myPrint(a: String): Unit = { println(a) }

scala> data.foreach(a => myPrint(a)) // prints all lines from file using myPrint function


// Record type for one line of the file; the field name "row" becomes the DataFrame column name
scala> case class EmailRow(row: String) // create class for row

scala> val df = data.map(x => EmailRow(x)).toDF() // Create dataframe

// show dataframe

scala> df.show()

scala> df.select("row").show()



// Create unique id column for dataset
// (monotonically_increasing_id replaces monotonicallyIncreasingId, which was deprecated in Spark 2.0
// and removed in 3.0; the generated ids are unique and increasing, but not consecutive)

scala> import org.apache.spark.sql.functions.monotonically_increasing_id

scala> val newDf = df.withColumn("id", monotonically_increasing_id()) // adds a new column at the end of the current dataset

scala> val ds2 = newDf.select("id", "row") // now id is the first column

scala> ds2.select("id", "row").where(ds2("row").contains("X-")).show() // keep only the rows whose text contains "X-" and show them

scala> ds2.count() // how many lines do I have in my dataset


val text_file = sc.textFile("hdfs://bigdata21.webmedia.int:8020/user/margusja/titanic_test.csv")
// each line is split on "," below; fields 2 and 5 are used as name and age

case class Person(name: String, age: String)
val people = text_file.map(_.split(",")).map(p => Person(p(2), p(5))).toDF()

// Age is a String and contains empty fields. Let's keep only the rows with a positive numeric age
people.filter($"age" > 0).select(people("age").cast("int")).show()

// let's take the average of people's age
people.filter($"age" > 0).select(avg(people("age").cast("int"))).show()

Posted in IT