2014-01-21 31 views
0

这本应是一个简单的问题,但我一直没能解决。我的代码中除了初始化从配置文件读入的 "train_rows" 和 "cols" 参数值之外,其他一切都正常工作。如何在 setup() 中初始化实例变量?

我设置了日志记录以在setup()方法中显示“train_rows”和“cols”的值,并且这些值是正确的。但是,当我在map()方法中尝试同样的东西时,两个值都显示为0.我做错了什么?

import java.io.File; import java.io.IOException; import java.io.FileNotFoundException; import java.util.Scanner; import org.apache.log4j.Logger;

import org.apache.hadoop.conf.Configuration; 
import org.apache.hadoop.io.IntWritable; 
import org.apache.hadoop.io.LongWritable; 
import org.apache.hadoop.io.Text; 
import org.apache.hadoop.mapreduce.Mapper; 

/**
 * Mapper for a brute-force K-nearest-neighbor (K=1) classifier.
 *
 * <p>setup() loads the training feature matrix ("train_sample.csv") and the
 * training labels ("train_labels.csv") into memory once per mapper, sized by
 * the "rows" and "columns" values from the job {@link Configuration}.
 *
 * <p>map() treats each input line as one test observation in the form
 * {@code <rowId>,<pixel_1>,...}, finds the training row with the smallest
 * squared-L2 distance, and emits {@code <rowId, predictedLabel>}.
 */
public class KNNMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
    private static final Logger sLogger = Logger.getLogger(KNNMapper.class);

    // Training feature matrix, [train_rows][cols]; filled in setup().
    private int[][] train_vals;
    // Label for each training row, [train_rows]; filled in setup().
    private int[] train_label_vals;
    private int train_rows;
    private int test_rows;   // currently unused in this class; kept for interface compatibility
    private int cols;

    /**
     * Reads sizing parameters from the job configuration, allocates the
     * training arrays, and loads both training CSV files.
     *
     * @throws IOException if either training file is missing or unreadable
     * @throws IllegalStateException if the "rows"/"columns" config keys were
     *         never set by the driver (the default of -1 would otherwise
     *         silently produce empty/negative-size arrays)
     */
    @Override
    public void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();

        train_rows = conf.getInt("rows", -1);
        cols = conf.getInt("columns", -1);

        // Fail fast: if the driver did not set these keys, every downstream
        // computation is silently wrong (this is the classic cause of seeing
        // 0 rows/cols inside map() when the wrong Configuration was used).
        if (train_rows <= 0 || cols <= 0) {
            throw new IllegalStateException(
                "Configuration keys 'rows'/'columns' missing or invalid: rows="
                    + train_rows + ", columns=" + cols);
        }

        train_vals = new int[train_rows][cols];
        train_label_vals = new int[train_rows];

        readTrainValues();
        readTrainLabels();
    }

    /** Loads the train_rows x cols feature matrix from "train_sample.csv" into train_vals. */
    private void readTrainValues() throws FileNotFoundException {
        File trainfile = new File("train_sample.csv");
        if (!trainfile.exists()) {
            // Propagate instead of swallowing: a missing file previously left
            // the arrays zero-filled and the job produced garbage predictions.
            throw new FileNotFoundException("train file didn't load: " + trainfile.getPath());
        }
        // try-with-resources guarantees the Scanner is closed even on parse errors.
        try (Scanner myScan = new Scanner(trainfile)) {
            // Values are separated by commas and/or line breaks.
            myScan.useDelimiter("[,\r\n]+");
            for (int row = 0; row < train_rows; row++) {
                for (int col = 0; col < cols; col++) {
                    train_vals[row][col] = Integer.parseInt(myScan.next());
                }
            }
        }
    }

    /** Loads train_rows labels from "train_labels.csv" into train_label_vals. */
    private void readTrainLabels() throws FileNotFoundException {
        File trainlabels = new File("train_labels.csv");
        if (!trainlabels.exists()) {
            throw new FileNotFoundException("train labels didn't load: " + trainlabels.getPath());
        }
        try (Scanner myScan = new Scanner(trainlabels)) {
            myScan.useDelimiter("[,\r\n]+");
            for (int row = 0; row < train_rows; row++) {
                train_label_vals[row] = Integer.parseInt(myScan.next());
            }
        }
    }

    /**
     * Classifies one test observation by nearest (squared-L2) training row.
     *
     * @param key   byte offset of the line in the input split (unused)
     * @param value one CSV line: {@code <rowId>,<pixel_1>,<pixel_2>,...}
     */
    @Override
    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {

        String[] pixels = value.toString().split(",");
        IntWritable rowId = new IntWritable(Integer.parseInt(pixels[0]));

        double best_distance = Double.POSITIVE_INFINITY;
        int best_digit = -1;

        for (int i = 0; i < train_rows; i++) {
            double distance = 0.0;
            // NOTE(review): j < cols preserves the original bound; if each test
            // line carries <id> plus a full `cols` pixels, the last pixel is
            // skipped here — confirm the test-CSV layout before changing.
            for (int j = 1; j < cols; j++) {
                int diff = Integer.parseInt(pixels[j]) - train_vals[i][j - 1];
                // BUG FIX: the original used (a - b)^2 — in Java '^' is bitwise
                // XOR, not exponentiation, so distances were computed wrongly.
                distance += diff * diff;
            }

            if (distance < best_distance) {
                best_distance = distance;
                best_digit = train_label_vals[i];
            }
        }

        if (sLogger.isDebugEnabled()) {
            sLogger.debug("row " + rowId + " -> " + best_digit
                + " (train_rows=" + train_rows + ", cols=" + cols + ")");
        }
        context.write(rowId, new IntWritable(best_digit));
    }
}
+0

这两行打印出来的是 '0' 吗?System.out.println("Number of train rows:" + train_rows); System.out.println("Number of columns:" + cols); –

+0

是否可以扫描文件train_sample.csv。 train_sample.csv位于哪里?在hdfs? –

+0

是这些行显示0.我能够扫描setup()中的文件,并且它们位于hdfs中。我使用了DistributedCache.createSymlink(conf);以允许更短的名称。 – user1956609

回答

0

我怀疑是下面这个问题——前提是你的文件存放在 HDFS 上。

您曾使用过:

import java.io.File; File trainfile = new File("train_sample.csv");

在 Hadoop 中,检查 HDFS 上的文件应该这样写:

try { FileSystem fs = FileSystem.get(context.getConfiguration());

if (fs.exists(new Path("/user/username/path/of/file/inhdfs"))) { 
     System.out.println("File exists"); 
} 

} catch (IOException e) { 
e.printStackTrace(); 

}

+0

这部分不是问题(请参阅上面的我的回复)。在setup()中,文件读取正常,参数值也一样。但由于某些原因,这些值在map()内不可访问。我会注意到,我只是在map()中插入了conf.getInt()语句的第二个副本,并且程序运行得非常完美,但对于每个映射都加载它并没有效率,我认为还有更好的方法。 – user1956609