Here I stream data in from a directory and write it out to an output location. I am also trying to implement a step that moves HDFS files from an input folder into the streaming directory. Right now this move happens only once, before the streaming context starts, but I would like it to be executed for every batch of the DStream. Is that even possible? How can I move files from within my Spark Streaming application?

val streamed_rdd = ssc.fileStream[LongWritable, Text, TextInputFormat](streaming_directory, (t: Path) => true, true)
  .map { case (x, y) => y.toString }

streamed_rdd.foreachRDD(rdd => {
  rdd.map(x => x.split("\t")).map(x => x(3)).foreachPartition { partitionOfRecords =>
    // One ActiveMQ connection per partition; each record is sent as a text message.
    val connection: Connection = connectionFactory.createConnection()
    connection.setClientID("Email_send_module_client_id")
    println("connection started with active mq")
    val session: Session = connection.createSession(false, Session.AUTO_ACKNOWLEDGE)
    println("created session")
    val dest = session.createQueue("dwEmailsQueue2")
    println("destination queue name = dwEmailsQueue2")
    val prod_queue = session.createProducer(dest)
    connection.start()
    partitionOfRecords.foreach { record =>
      val rec_to_send: TextMessage = session.createTextMessage(record)
      println("started creating a text message")
      prod_queue.send(rec_to_send)
      println("sent the record")
    }
    connection.close()
  }
})

// The block below is the move I am asking about: it runs only once, while the
// job graph is being set up, i.e. before ssc.start().
val LIST = scala.collection.mutable.MutableList[String]()
val files_to_move = scala.collection.mutable.MutableList[String]()
val cmd = "hdfs dfs -ls -d " + load_directory + "/*"
println(cmd)
val system_time = System.currentTimeMillis
println(system_time)
val output = cmd.!!
output.split("\n").foreach(x => x.split(" ").foreach(x => if (x.startsWith("/user/hdpprod/")) LIST += x))
LIST.foreach(x => if (x.split("/").last.split("_").last.toLong < system_time) files_to_move += x)
println("files to move " + files_to_move)
var mv_cmd: String = "hdfs dfs -mv "
for (file <- files_to_move) {
  mv_cmd += file + " "
}
mv_cmd += streaming_directory
println(mv_cmd)
val mv_output = mv_cmd.!!
println("moved the data to the folder")

// Note: count() on a DStream returns another DStream, so this toString
// comparison never actually sees the per-batch count.
if (streamed_rdd.count().toString == "0") {
  println("no data in the streamed list")
} else {
  println("saving the Dstream at " + System.currentTimeMillis())
  streamed_rdd.transform(rdd => rdd.map(x => check_time_to_send + "\t" + check_time_to_send_utc + "\t" + x))
    .saveAsTextFiles("/user/hdpprod/temp/spark_streaming_output_sent/sent")
}

ssc.start()
ssc.awaitTermination()
}
}
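One way to make the move happen for every batch is to do it at the top of foreachRDD, which runs on the driver once per interval. Below is a minimal sketch using the Hadoop FileSystem API instead of shelling out to hdfs dfs; moveReadyFiles is a hypothetical helper, and the trailing _&lt;millis&gt; file-name convention is borrowed from the listing logic above.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Hypothetical helper (assumption): moves every file in loadDirectory whose
// name ends in an _<millis> suffix older than now into streamingDirectory.
def moveReadyFiles(loadDirectory: String, streamingDirectory: String): Unit = {
  val fs = FileSystem.get(new Configuration())
  val now = System.currentTimeMillis
  fs.listStatus(new Path(loadDirectory)).foreach { status =>
    val name = status.getPath.getName
    val suffix = name.split("_").last
    if (suffix.nonEmpty && suffix.forall(_.isDigit) && suffix.toLong < now) {
      fs.rename(status.getPath, new Path(streamingDirectory, name))
    }
  }
}

// Called at the start of each batch, on the driver, before the records of
// that batch are processed:
streamed_rdd.foreachRDD { rdd =>
  moveReadyFiles(load_directory, streaming_directory)
  // ... existing per-batch processing ...
}

Files moved during batch N are only discovered by the file stream in a later interval, so the move effectively feeds the next batch rather than the current one.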

Answer


I tried to do the same thing in a Java implementation, shown below. You can call this method on each batch's RDD from foreachRDD.

import java.io.File;
import java.nio.file.StandardCopyOption;

import org.apache.spark.Partition;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.rdd.NewHadoopPartition;
import org.apache.spark.rdd.UnionPartition;

public static void moveFiles(final String moveFilePath, final JavaRDD<?> rdd) {
    // Each partition of a fileStream batch is a UnionPartition wrapping the
    // Hadoop input split, from which the source file path can be recovered.
    for (final Partition partition : rdd.partitions()) {
        final UnionPartition unionPartition = (UnionPartition) partition;
        final NewHadoopPartition newHadoopPartition =
            (NewHadoopPartition) unionPartition.parentPartition();
        final String fPath = newHadoopPartition.serializableHadoopSplit()
            .value().toString();
        // The split's string form is "path:start+length", hence the split on ':'.
        final String[] filespaths = fPath.split(":");

        if ((filespaths != null) && (filespaths.length > 0)) {
            for (final String filepath : filespaths) {
                if ((filepath != null) && filepath.contains("/")) {
                    final File file = new File(filepath);

                    if (file.exists() && file.isFile()) {
                        try {
                            File destFile = new File(moveFilePath + "/" + file.getName());

                            // Do not clobber an existing file of the same name.
                            if (destFile.exists()) {
                                destFile = new File(moveFilePath + "/" + file.getName() + "_");
                            }

                            java.nio.file.Files.move(file.toPath(), destFile.toPath(),
                                StandardCopyOption.REPLACE_EXISTING);
                        } catch (Exception e) {
                            // logger is assumed to be the enclosing class's logger.
                            logger.error("Exception while moving file", e);
                        }
                    }
                }
            }
        }
    }
}
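As a hedged usage sketch, the method above would be invoked once per batch from foreachRDD on the driver. MoveUtil is an assumed name for the class hosting moveFiles, and the destination path is illustrative. Note also that java.io.File and Files.move operate on the local filesystem, so this variant fits a locally mounted input directory; for files that actually live in HDFS, a FileSystem.rename approach like the sketch after the question code is the analogue.

// Hypothetical wiring from the Scala job in the question:
streamed_rdd.foreachRDD { rdd =>
  // toJavaRDD bridges the Scala RDD to the JavaRDD the helper expects.
  MoveUtil.moveFiles("/user/hdpprod/processed", rdd.toJavaRDD())
}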