Example of a large dataset re-processor (text file IO in Java)

With almost 100,000,000 records to process across two data files that did not share a unified format, a processor tool had to be crafted to handle both formats and distill them down to a third, smaller and more concise format.

The following source code is an example of how I accomplished that task using Java.

/*
------------------------------------------------------------------------
 $Id: fileRepair.java,v 1.1 2012/03/21 18:31:47 ddemartini Exp $
 $Revision: 1.3 $
 $Date: 2012/03/21 18:31:47 $
 $Name:  $
------------------------------------------------------------------------
 Loader 
------------------------------------------------------------------------
    colname[0]   = "seq_num";	
    colname[1]   = "ip";  
    colname[2]   = "mode";   - drop
    colname[3]   = "property";  
    colname[4]   = "threat";    
    colname[5]   = "desc"; - drop
    colname[6]   = "meta";
    colname[7]   = "detected";  - drop
    colname[8]   = "det_method"; 
    colname[9]   = "reported"; - drop
    colname[10]  = "rpt_method";
    colname[11]  = "target"; - drop
    colname[12]  = "source";
------------------------------------------------------------------------
*/

import base.*;   /*  this contains base and util classes */
import java.io.UnsupportedEncodingException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.io.File;
import java.io.BufferedReader;
import java.nio.ByteBuffer;
import java.util.Hashtable;
import java.util.regex.Pattern;

public class fileRepair{

  private static int skip          = Integer.parseInt(Util.envOrProp("skip")); // number of lines to skip from top
  private static int tStep         = 600000;  // milisecond step for status update (600000 = 10 minutes)
  private static long totInserts   = 0;  // counter for total inserts
  private static int [] fuse       = { 0,1,3,4,6,8,10,12 }; // these are the only fields that will be loaded.
  private static String inFile     = Util.envOrProp("in");
  private static String outFile    = Util.envOrProp("out");
  private static Hashtable srcMap;
 
  public static void main (String[] args) throws Exception {
    System.out.println("inFile  "+inFile);
    System.out.println("outFile "+outFile);

    srcMap = new Hashtable();
    /* OK, this is likely the most rudimently way to od this, but so be it... it's done */
    srcMap.put("19","SINKHOLE_KF3");
    srcMap.put("51","SINKHOLE_KQ11");
    srcMap.put("1000","EXTERN");
    srcMap.put("1009","INTERN_BN3");
    srcMap.put("1011","INTERN_BN9");
    srcMap.put("1012","INTERN_PD4");
    srcMap.put("1014","INTERN_PD1");
    srcMap.put("1025","EMAIL");
    srcMap.put("1027","CLIENT_AP82");
    srcMap.put("1033","CLIENT_AP49");
    srcMap.put("prop_id","source");
    srcMap.put("","UNKNOWN");
    srcMap.put(" ","UNKNOWN");
    srcMap.put("0","UNKNOWN");

    /* Process the file */
    processFile(inFile,outFile); 

  }  /* main */

  public static void processFile(String iFile, String oFile) throws Exception {
    // configure IO
    long tStart;         // get system clock time for a timer
    long tNext;          // get system clock time for a timer
    long tempIns   = 0;  // temporary insert counter
    long currline  = 0;  // line counter
    Pattern delim  = Pattern.compile("[\t]");  // pre-compile the splitter

    try {
      FileReader fRead      = new FileReader(iFile);
      BufferedReader bRead  = new BufferedReader(fRead);
      FileWriter fWrite     = new FileWriter(oFile);
      PrintWriter pWrite    = new PrintWriter(fWrite);

      String line; // used for temporary storage
      
      // loop through the intake file
      tStart = System.currentTimeMillis();   // get system clock time for a timer
      tNext  = tStart + tStep; // set next step unit.

      while ((line = bRead.readLine()) != null) {
        /* implement line skipper */
        if(currline++ < skip) {
          continue;
        }
        // Parse he line on tabs.
        String [] fields = delim.split(line);  // split records based on pre-compiled regex object
   
        // Count the columns  --  if only 11 columns, add the source to the stream. 
        if(fields.length < 13) {
          // re-parse with the extra required data dropped onto the end //
          fields = delim.split(line+"\t0\t"+srcMap.get(fields[2]).toString());
        }

        // Synthesize the desired output line
        StringBuffer newLine = new StringBuffer(fields[fuse[0]]);   // list of strings in columns
        for(int f=1;f tNext) {
          System.out.println("\t"+tempIns);
          tempIns = 0;
          tNext += tStep;
        }
      }
      /* close the file handles */      
      fRead.close();
      fWrite.close();

    } catch ( Exception e ) {
      System.out.println("Fatal Error ("+e+") Encountered - quitting");
      return; 
    }
    // Report total records written, and time consumed.
    long nanEnd = System.currentTimeMillis();
    System.out.println(totInserts+" lines processed");
    System.out.println("Completed loading data at "+nanEnd);
    System.out.println("Total Load Time ("+(nanEnd-tStart)+") - "+((nanEnd-tStart)/1000)+" seconds");    
    return;
  }  /* processFile */

} /* fileRepair */

The first run against a 71 million line file completed in under 10 minutes.

     [java] inFile  ~/workbench/all_ips.dta
     [java] outFile ~/workbench/ips.1.dta
     [java] 71,244,397 lines processed

Total time: 9 minutes 41 seconds