|
/** |
|
* Author: Pierre Lindenbaum PhD. @yokofakun |
|
* http://www.biostars.org/p/63016/ |
|
* Compare two BAM files |
|
* tested with picard.1-62 and berkeleydb java edition 4.1.10 |
|
* |
|
* compilation & exec: |
|
* javac -cp path/to/picard.jar:path/to/sam.jar:path/to/je-4.1.10.jar Biostar63016.java |
|
* mkdir TMP |
|
* java -cp path/to/picard.jar:path/to/sam.jar:path/to/je-4.1.10.jar:. Biostar63016 -d TMP file1.bam file2.bam |
|
*/ |
|
|
|
import java.io.File; |
|
import java.io.IOException; |
|
import java.util.ArrayList; |
|
import java.util.HashMap; |
|
import java.util.HashSet; |
|
import java.util.Iterator; |
|
import java.util.List; |
|
import java.util.Map; |
|
import java.util.Set; |
|
import java.util.logging.Logger; |
|
|
|
import net.sf.samtools.SAMFileReader; |
|
import net.sf.samtools.SAMFileReader.ValidationStringency; |
|
import net.sf.samtools.SAMRecord; |
|
import net.sf.samtools.SAMSequenceRecord; |
|
|
|
import com.sleepycat.bind.tuple.StringBinding; |
|
import com.sleepycat.bind.tuple.TupleInput; |
|
import com.sleepycat.bind.tuple.TupleOutput; |
|
import com.sleepycat.je.Cursor; |
|
import com.sleepycat.je.Database; |
|
import com.sleepycat.je.DatabaseConfig; |
|
import com.sleepycat.je.DatabaseEntry; |
|
import com.sleepycat.je.Environment; |
|
import com.sleepycat.je.EnvironmentConfig; |
|
import com.sleepycat.je.LockMode; |
|
import com.sleepycat.je.OperationStatus; |
|
import com.sleepycat.je.Transaction; |
|
|
|
|
|
public class Biostar63016 |
|
{ |
|
private static final Logger LOG=Logger.getLogger(Biostar63016.class.getName()); |
|
private static final String DATABASENAME="read2pos"; |
|
private File dbHome; |
|
private Environment environment=null; |
|
private Database database=null; |
|
private Transaction txn; |
|
private int currentSamFileIndex=0; |
|
|
|
private static class Match |
|
{ |
|
byte tid; |
|
int pos; |
|
|
|
@Override |
|
public int hashCode() |
|
{ |
|
int result = 1; |
|
result = 31 * result + pos; |
|
result = 31 * result + tid; |
|
return result; |
|
} |
|
@Override |
|
public boolean equals(Object obj) |
|
{ |
|
if (this == obj) { return true; } |
|
if (obj == null) { return false; } |
|
Match other = (Match) obj; |
|
if (tid != other.tid) { return false; } |
|
if(tid==-1) return true; |
|
if (pos != other.pos) { return false; } |
|
return true; |
|
} |
|
@Override |
|
public String toString() |
|
{ |
|
if(tid<0) return "unmapped"; |
|
return String.valueOf(tid)+":"+pos; |
|
} |
|
} |
|
|
|
/** fileid to matches */ |
|
private List<Set<Match>> decode(final DatabaseEntry data) |
|
throws IOException |
|
{ |
|
TupleInput in=new TupleInput(data.getData()); |
|
final byte nfiles=2; |
|
List<Set<Match>> L=new ArrayList<Set<Match>>(nfiles); |
|
for(int i=0;i< nfiles;++i) |
|
{ |
|
byte nmatches=in.readByte(); |
|
Set<Match> set=new HashSet<Match>(nmatches); |
|
L.add(set); |
|
for(int j=0;j< nmatches;++j) |
|
{ |
|
Match m=new Match(); |
|
m.tid=in.readByte(); |
|
m.pos=in.readInt(); |
|
set.add(m); |
|
} |
|
} |
|
|
|
return L; |
|
} |
|
|
|
private void print(final Set<Match> set,Map<Integer,String> tid2name) |
|
{ |
|
boolean first=true; |
|
for(Match m:set) |
|
{ |
|
if(!first)System.out.print(','); |
|
first=false; |
|
if(m.tid<0){ System.out.print("unmapped"); continue;} |
|
String seqName=tid2name.get(m.tid); |
|
if(seqName==null) seqName="tid-"+m.tid; |
|
System.out.print(String.valueOf(seqName+":"+(m.pos))); |
|
} |
|
if(first) System.out.print("(empty)"); |
|
} |
|
|
|
private void encode(DatabaseEntry data,final List<Set<Match>> L) |
|
{ |
|
TupleOutput out=new TupleOutput(); |
|
for(Set<Match> set:L) |
|
{ |
|
out.writeByte((byte)Math.min(Byte.MAX_VALUE, set.size())); |
|
int count=0; |
|
for(Match m:set) |
|
{ |
|
if(++count>=Byte.MAX_VALUE) break; |
|
out.writeByte(m.tid); |
|
out.writeInt(m.pos); |
|
} |
|
} |
|
data.setData(out.getBufferBytes(),out.getBufferOffset(),out.getBufferLength()); |
|
} |
|
|
|
private void run(String[] args) |
|
throws Exception |
|
{ |
|
int optind=0; |
|
while(optind< args.length) |
|
{ |
|
if(args[optind].equals("-h") || |
|
args[optind].equals("-help") || |
|
args[optind].equals("--help")) |
|
{ |
|
System.err.println("Pierre Lindenbaum PhD. 2013"); |
|
System.err.println("Options:"); |
|
System.err.println(" -h help; This screen."); |
|
return; |
|
} |
|
else if(args[optind].equals("-d") && optind+1< args.length) |
|
{ |
|
this.dbHome=new File(args[++optind]); |
|
} |
|
else if(args[optind].equals("--")) |
|
{ |
|
optind++; |
|
break; |
|
} |
|
else if(args[optind].startsWith("-")) |
|
{ |
|
System.err.println("Unknown option "+args[optind]); |
|
return; |
|
} |
|
else |
|
{ |
|
break; |
|
} |
|
++optind; |
|
} |
|
if(this.dbHome==null) |
|
{ |
|
System.err.println("db-home undefined"); |
|
return; |
|
} |
|
if(args.length-optind!=2) |
|
{ |
|
System.err.println("Expected 2 bams"); |
|
return; |
|
} |
|
Map<Integer,String> tid2name=null; |
|
DatabaseEntry key=new DatabaseEntry(); |
|
DatabaseEntry data=new DatabaseEntry(); |
|
|
|
EnvironmentConfig envConfig= new EnvironmentConfig(); |
|
envConfig.setAllowCreate(true); |
|
envConfig.setReadOnly(false); |
|
envConfig.setConfigParam(EnvironmentConfig.LOG_FILE_MAX,"250000000"); |
|
envConfig.setTransactional(true); |
|
this.environment= new Environment(this.dbHome, envConfig); |
|
|
|
this.txn=this.environment.beginTransaction(null, null); |
|
DatabaseConfig cfg= new DatabaseConfig(); |
|
cfg.setAllowCreate(true); |
|
cfg.setReadOnly(false); |
|
cfg.setTransactional(true); |
|
this.database= this.environment.openDatabase(txn,DATABASENAME,cfg); |
|
|
|
currentSamFileIndex=0; |
|
while(optind<args.length) |
|
{ |
|
long nReads=0L; |
|
File samFile=new File(args[optind++]); |
|
SAMFileReader samFileReader=new SAMFileReader(samFile); |
|
samFileReader.setValidationStringency(ValidationStringency.SILENT); |
|
if(samFileReader.getFileHeader().getSequenceDictionary().getSequences().size()>=Byte.MAX_VALUE) |
|
{ |
|
System.err.println("Too many Ref Sequences . Limited to "+Byte.MAX_VALUE); |
|
return; |
|
} |
|
if(tid2name==null) |
|
{ |
|
tid2name=new HashMap<Integer, String>(); |
|
for(SAMSequenceRecord ssr:samFileReader.getFileHeader().getSequenceDictionary().getSequences()) |
|
{ |
|
tid2name.put(ssr.getSequenceIndex(), ssr.getSequenceName()); |
|
} |
|
} |
|
|
|
for(Iterator<SAMRecord> iter=samFileReader.iterator(); |
|
iter.hasNext(); ) |
|
{ |
|
if(nReads++%10000000==0) LOG.info("in "+samFile+" "+nReads); |
|
SAMRecord rec=iter.next(); |
|
StringBinding.stringToEntry(rec.getReadName(), key); |
|
|
|
List<Set<Match>> matches=null; |
|
if(this.database.get(this.txn, key, data, LockMode.DEFAULT)==OperationStatus.SUCCESS) |
|
{ |
|
matches=decode(data); |
|
} |
|
else |
|
{ |
|
matches=new ArrayList<Set<Match>>(); |
|
for(int i=0;i< 2;++i) matches.add(new HashSet<Match>()); |
|
} |
|
|
|
Match match=new Match(); |
|
match.tid=(byte)rec.getReferenceIndex().intValue(); |
|
match.pos=rec.getAlignmentStart(); |
|
matches.get(this.currentSamFileIndex).add(match); |
|
encode(data,matches); |
|
if(this.database.put(this.txn, key, data)!=OperationStatus.SUCCESS) |
|
{ |
|
System.err.println("BDB error."); |
|
System.exit(-1); |
|
} |
|
} |
|
samFileReader.close(); |
|
this.currentSamFileIndex++; |
|
} |
|
//compute the differences for each read |
|
key=new DatabaseEntry(); |
|
Cursor c=this.database.openCursor(txn,null);; |
|
while(c.getNext(key, data,LockMode.DEFAULT)==OperationStatus.SUCCESS) |
|
{ |
|
System.out.print(StringBinding.entryToString(key)); |
|
List<Set<Match>> matches=decode(data); |
|
final Set<Match> first=matches.get(0); |
|
final Set<Match> second=matches.get(1); |
|
if(first.equals(second)) |
|
{ |
|
System.out.print("\tEQ\t"); |
|
} |
|
else |
|
{ |
|
System.out.print("\tNE\t"); |
|
} |
|
print(first,tid2name); |
|
System.out.print("\t"); |
|
print(second,tid2name); |
|
System.out.println(); |
|
} |
|
c.close(); |
|
this.database.close(); |
|
this.environment.removeDatabase(txn, DATABASENAME); |
|
this.txn.commit(); |
|
this.environment.close(); |
|
} |
|
|
|
public static void main(String[] args) throws Exception |
|
{ |
|
new Biostar63016().run(args); |
|
} |
|
|
|
} |
If the aligners in use keep all the reads in the input order (bwa/bowtie/soap2 among many others do that by default), you can simply use
paste
or reading two files line by line.How much memory do you have? If they are just read ids, 30 million (assuming 30 characters each) will probably take around a gig of memory. It's pretty reasonable.
I've seen python dictionaries go above 10 gigabytes. So, depending on the size and the memory available, I think python could work here.
you don't need dictionary, use set() in python