If you want to guess what your insert is, here follows a piece of code I wrote using Bamtools API :
File biostar14384.cpp
:
#include <iostream>
#include <fstream>
#include <ctime>
#include "api/BamReader.h"
#include "api/BamWriter.h"
using namespace std;
using namespace BamTools;
// Bin of 20 bp for insert size dist
#define BIN 20
// Max insert value in dist
#define MAX_V 1000
// Max index value in dist
#define MAX_I MAX_V/BIN
// Array of length MAXV/BIN with all values
// initialized to zero
int dist [MAX_I+1] = {0};
int main(int argc, char* argv[])
{
// Reading input parameter (BAM file)
string inputBamFile;
char * outputPath;
if ( argc == 3 )
{
inputBamFile = argv[1];
outputPath = argv[2];
}
else
{
cerr << "Wrong number of arguments. Requires 2." << endl;
cerr << "Input BAM & output file path." << endl;
return 1;
}
// Open the BAM file for reading
BamReader reader;
if (!reader.Open(inputBamFile))
{
cerr << "Could not open input BAM file." << endl;
return 1;
}
// Opening output file for writing
char outputFile [200];
strcpy (outputFile,outputPath);
strcat (outputFile,"/insertSizeDist.txt");
ofstream outfile (outputFile);
if ( !outfile )
{
// outfile is bad
cerr << "Bad output filename : " << outputFile << endl;
return EXIT_FAILURE;
}
time_t myTime = time(NULL);
cout << asctime(localtime(&myTime));
cout << "Generating Insert Size Distribution...";
// Now processing each record of the BAM file
BamAlignment al;
int count = 0;
while (reader.GetNextAlignment(al)) {
int32_t tag;
int32_t insertSize;
int index;
if (al.IsPaired())
{
if (al.IsFirstMate())
{
if (al.IsMapped())
{
if (al.IsMateMapped())
{
count++;
insertSize = abs(al.InsertSize);
index = insertSize/BIN;
if (index > MAX_I) { index = MAX_I; }
dist[index]++;
}
}
}
}
}
reader.Close();
// Now printing the distribution to output file
outfile << ">>INSERT_SIZE_DISTRIBUTION" << endl;
outfile << "RANGE\tVALUE" << endl;
int i;
short int inf;
short int sup;
for(i=0; i < MAX_I; i++)
{
inf = i*BIN;
sup = (i+1)*BIN-1;
outfile << inf << "_" << sup << "\t" << dist[i] << endl;
}
inf = MAX_I*BIN;
outfile << inf << "_" << "INF" << "\t" << dist[MAX_I] << endl;
outfile.close();
cout << "done." << endl;
cout << count << " pairs used (both reads mapped)." << endl;
myTime = time(NULL);
cout << asctime(localtime(&myTime));
return 0;
}
Compile with :
c++ -I /...../bamtools/include/ -L /...../bamtools/lib/ -o biostar14384 biostar14384.cpp -lz -lbamtools
Execute :
./biostar14384 bams/5183_8_sw.mkdup.bam /my/output/dir/
Standard Output :
Wed Nov 16 15:36:12 2011
Generating Insert Size Distribution...done.
27065940 pairs used (both reads mapped).
Wed Nov 16 15:40:25 2011
File /my/output/dir/insertSizeDist.txt :
>>INSERT_SIZE_DISTRIBUTION
RANGE VALUE
0_19 125402
20_39 2556
40_59 2718
60_79 1859
80_99 2963
100_119 147176
120_139 174268
140_159 186298
160_179 192914
180_199 198538
200_219 207544
220_239 218097
240_259 237805
260_279 278922
280_299 354523
300_319 520249
320_339 1168630
340_359 2503095
360_379 3302285
380_399 3274498
400_419 2943622
420_439 2583942
440_459 2257605
460_479 1953723
480_499 1659179
500_519 1301254
520_539 794974
540_559 314170
560_579 76720
580_599 15034
600_619 6246
620_639 4831
640_659 4758
660_679 5138
680_699 1289
700_719 1274
720_739 1198
740_759 962
760_779 863
780_799 860
800_819 807
820_839 911
840_859 814
860_879 698
880_899 250
900_919 190
920_939 233
940_959 232
960_979 215
980_999 168
1000_INF 33440
Hope this helps !
Here is the SAM file specs: http://samtools.sourceforge.net/SAM1.pdf
It might be in the SAM headers. Look for a "@RG PI" header. If it's not in the headers, then you'll probably have to just parse through the SAM file and extrapolate it yourself.
Yes good point: I saw this PI the specs too in RG header but I forgot to say that my sam file does not have this parameter set in RG. When you say extrapolate it yourself, you mean doing for instance calculating the average of lengths between read pairs? (I found something like 297).
Use of this site constitutes acceptance of our User Agreement and Privacy Policy.
What do you need exactly ... a full distribution , median/average ? Based on all pairs or only uniquely mapped ?