Need help with performance...
Hi everyone,
I have a really big text file with around 1.3 million lines, and I have to transform it. The file looks pretty much like this:
# some
# lines
# of
# comment
# -
c_id foo cu_id bar_id value_1 value_2
22 014433758 456 123 C C
22 014433758 456 123 C C
22 014433758 456 123 C C
22 014433758 456 123 - -
22 014433758 456 123 C C
22 014433758 456 123 C C
22 014433758 456 123 C C
22 014433758 456 789 B B
22 014433758 456 789 B B
22 014433758 456 789 B B
What I have to do now is transform the file so that it looks like this:
c_id cu_id 123 789
22 456 C_C B_B
22 456 C_C B_B
22 456 C_C B_B
22 456 -_- B_B
22 456 C_C B_B
22 456 C_C B_B
22 456 C_C B_B
22 456 C_C B_B
22 456 C_C B_B
22 456 C_C B_B
So, basically, instead of a single bar_id column I need one column for each bar_id value, with the corresponding values underneath. To do that, I wrote the following code:
publicclass Transformer
{
private String _commentChar, _sepChar;
private BufferedReader _br;
private HashMap<String, String> _map;
private Vector<String> _snps;
public Transformer()
{
_commentChar = Main.getProperty(AppConstants.COMMENT_CHAR);
_sepChar = Main.getProperty(AppConstants.SNP_SEP_CHAR);
_map =new HashMap<String, String>();
_snps =new Vector<String>();
}
publicvoid transformFile(File f)
throws FileNotFoundException, IOException
{
String line =null;
String header =null;
String content ="";
String[] values =null;
//id refers to the cu_id from above
String id ="";
String temp ="";
boolean beginFound =false;
int count = 0;
Utils u =new Utils();
_br =new BufferedReader(new FileReader(f));
while(( line = _br.readLine()) !=null)
{
if(!beginFound)
{
if(!line.startsWith(_commentChar))
{
beginFound =true;
values = getLineAsArray(line);
header = values[0]+_sepChar+values[2];
}
}
else
{
values = getLineAsArray(line);
if(values.length > 3)
{
id = values[2].trim();
String snp = values[3].trim();
if(_map.containsKey(id))
{
temp = _map.get(id);
content = temp.substring(temp.indexOf("\n"), temp.length());
temp = temp.substring(0, temp.indexOf("\n"));
temp += _sepChar+ content+u.getEmptyStringWithLength(id.length())
+getAlleles(line, snp);
_map.put(id, temp);
}
else
{
temp ="\n" + values[0]+_sepChar+id + u.getEmptyStringWithLength(id.length())
+ getAlleles(line, snp);
_map.put(id, temp);
}
if(!_snps.contains(snp)){
header += _sepChar+snp;
}
_snps.add(snp);
}
}
System.out.println("Zeile gelesen: "+ ++count);
}
_br.close();
writeFile(f, header);
}
privatevoid writeFile(File f, String header)
throws FileNotFoundException
{
PrintStream ps =new PrintStream(new File(getExportFilename(f)));
String key ="";
int count = 0;
ps.println(header);
for(Iterator<String> i = _map.keySet().iterator(); i.hasNext(); )
{
key = i.next();
ps.print(_map.get(key));
System.out.println("Zeile geschrieben: "+ ++count);
}
ps.close();
}
private String getExportFilename(File f)
{
Utils u =new Utils();
String s = u.getFilenameWithoutExtension(f);
System.out.println("Filename "+u.getPath(f,true)+s+"_2.txt");
return u.getPath(f,true)+s+"_2.txt";
}
private String getValue(String line,int pos)
{
StringTokenizer st =new StringTokenizer(line);
String s =null;
int count = 0;
if(st.countTokens() > 1)
{
while(st.hasMoreTokens() && count != pos)
{
s = st.nextToken();
count++;
}
}
return s.trim();
}
private String[] getLineAsArray(String line)
{
return line.split(_sepChar);
}
private String getAlleles(String line, String snp)
{
String s = line.substring(line.indexOf(snp)+snp.length(), line.length());
s = s.trim().replaceAll(" ","");
s = s.trim().replaceAll(_sepChar,"");
char[] c = s.toCharArray();
s =null;
StringBuffer sb =new StringBuffer();
sb.append(c[0]);
sb.append("_");
sb.append(c[1]);
return sb.toString();
}
private String getPID(String line)
{
StringTokenizer st =new StringTokenizer(line);
String s =null;
int count = 0;
if(st.countTokens() > 1)
{
while(st.hasMoreTokens() && count != 2)
{
s = st.nextToken();
count++;
}
}
return s.trim();
}
}
The problem now is that it takes awfully long (about 250,000 lines in 90 minutes), and I am wondering whether this is normal. Admittedly the file is really big, but maybe there is a way to make it faster.
Appreciate any advice!
Chris

