Using Hdf5 To Store Bio-Data

Entering edit mode

15.4 years ago

Pierre Lindenbaum 166k

Hi all,

Has anybody ever used the HDF5 API to store biological data (genotypes...)? I know about this kind of reference (BioHDF...), but I'm looking for some source code I could browse to understand how I can access data faster.

Pierre

PS: hum, I'm a new user. I'm not allowed to add the following tags: storage database hdf5 source code

hdf5 storage database • 12k views

ADD COMMENT • link updated 21 months ago by Ram 45k • written 15.4 years ago by Pierre Lindenbaum 166k

Entering edit mode

15.4 years ago

Abhishek Tiwari ▴ 120

This might be useful for you. This code snippet reads simulation data and manipulates it in HDF5.

	/* Note: change the first 5 includes from "" to angle brackets (<...>). */
	#include "stdlib.h"
	#include "stdio.h"
	#include "string.h"
	#include "hdf5.h"
	#include "hdf5_hl.h"

	#include "common.h"
	#include "hdf5_data.h"
	#include "metadata/simulation.h"
	#include "metadata/simulation_list.h"

	/* What an HDF5Data handle is currently being used for. */
	enum FileIntent
	{
		READING = 0,	/* file is open for reading */
		WRITING = 1,	/* file is open for writing */
		NEITHER = 2	/* no file has been opened yet */
	};

	/* State for one HDF5 file: the file handle, the currently selected group
	 * and the packet table used to append or read data in that group. */
	struct HDF5Data
	{
	hid_t file; /* file handle; values > 0 are treated as open */
	hid_t group; /* current group handle; values > 0 are treated as open */
	int ptCreated; /* non-zero once pt refers to an open packet table */
	hid_t pt; /* packet table handle, only meaningful when ptCreated is set */
	enum FileIntent intent; /* whether the file was opened for reading or writing */
	};

	/* Iterator callback for pulling out existing simulation data.
	 * Currently assumes we are only dealing with our own files but could be
	 * made smarter to find only groups that contain the required dataspaces.
	 *
	 * group: group being iterated over (not used here).
	 * name:  name of the member currently being visited.
	 * _iter: opaque user data; must point to a struct SimulationList.
	 *
	 * Returns 0 when a simulation named after the member was appended to the
	 * list, or -1 when no list was supplied or no simulation could be created.
	 */
	static herr_t
	rootIterator(hid_t group,const char* name,void* _iter)
	{
		struct SimulationList* list = (struct SimulationList*)_iter;
		struct Simulation* s;

		(void)group;
		if (list == NULL) return(-1);
		s = CreateSimulation();
		if (s == NULL) return(-1);
		simulationSetName(s,name);
		simulationListAppend(list,s);
		/* NOTE(review): presumably the list keeps its own copy or reference,
		 * since the local one is released right away - confirm. */
		DestroySimulation(&s);
		return(0);
	}

	/* Allocate a new HDF5Data object with no file, group or packet table open.
	 *
	 * Handles are initialised to the integer 0 (hid_t is an integer type, so
	 * casting the NULL pointer constant to it is avoided); the rest of this
	 * file treats only values > 0 as open handles.
	 *
	 * Returns the new object, or NULL if the allocation failed. The caller
	 * releases it with DestroyHDF5Data().
	 */
	struct HDF5Data* CreateHDF5Data()
	{
		struct HDF5Data* hdf5 = malloc(sizeof *hdf5);
		if (hdf5)
		{
			hdf5->file = 0;
			hdf5->group = 0;
			hdf5->pt = 0;
			hdf5->ptCreated = 0;
			hdf5->intent = NEITHER;
		}
		return(hdf5);
	}

	/* Close whatever is still open (packet table, group, file), free the
	 * object and reset the caller's pointer to NULL.
	 *
	 * hdf5: address of the pointer returned by CreateHDF5Data().
	 *
	 * Returns OK when an object was destroyed, ERR when hdf5 or *hdf5 is NULL.
	 */
	int DestroyHDF5Data(struct HDF5Data** hdf5)
	{
		struct HDF5Data* h5;

		if ((hdf5 == NULL) || (*hdf5 == NULL)) return(ERR);
		h5 = *hdf5;
		if (h5->ptCreated) H5PTclose(h5->pt);
		if (h5->group > 0) H5Gclose(h5->group);
		if (h5->file > 0) H5Fclose(h5->file);
		free(h5);
		/* clear the caller's pointer so it cannot be used after the free */
		*hdf5 = NULL;
		return(OK);
	}

	/* Create the HDF5 file `filename` (H5F_ACC_TRUNC) and mark the object as
	 * being used for writing.
	 *
	 * Returns OK on success, ERR when hdf5 is NULL or the file could not be
	 * created.
	 */
	int hdf5DataOpenFileForWriting(struct HDF5Data* hdf5,const char* filename)
	{
		if (!hdf5) return(ERR);

		hdf5->file = H5Fcreate(filename,H5F_ACC_TRUNC,H5P_DEFAULT,H5P_DEFAULT);
		if (hdf5->file < 0) return(ERR);

		hdf5->intent = WRITING;
		return(OK);
	}

	/* Open the existing HDF5 file `filename` read-only (H5F_ACC_RDONLY) and
	 * mark the object as being used for reading.
	 *
	 * Returns OK on success, ERR when hdf5 is NULL or the file could not be
	 * opened.
	 */
	int hdf5DataOpenFileForReading(struct HDF5Data* hdf5,const char* filename)
	{
		if (!hdf5) return(ERR);

		hdf5->file = H5Fopen(filename,H5F_ACC_RDONLY,H5P_DEFAULT);
		if (hdf5->file < 0) return(ERR);

		hdf5->intent = READING;
		return(OK);
	}

	/* Select the group that subsequent reads/writes operate on.
	 *
	 * Any previously selected group and its packet table are closed first.
	 * When the file was opened for writing the group is created, otherwise an
	 * existing group is opened. Passing a NULL groupName only closes the
	 * current group.
	 *
	 * Returns OK on success; ERR when hdf5 is NULL, no file is open, or the
	 * group could not be created/opened (the group is then left unset).
	 */
	int hdf5DataSetGroup(struct HDF5Data* hdf5,const char* groupName)
	{
		if ((hdf5 == NULL) || (hdf5->file <= 0)) return(ERR);

		if (hdf5->group > 0) H5Gclose(hdf5->group);
		/* forget the closed handle so it is never closed a second time */
		hdf5->group = 0;
		if (hdf5->ptCreated) H5PTclose(hdf5->pt);
		hdf5->ptCreated = 0;

		if (groupName == NULL) return(OK);

		if (hdf5->intent == WRITING)
			hdf5->group = H5Gcreate(hdf5->file,groupName,/*size_hint*/0);
		else
			hdf5->group = H5Gopen(hdf5->file,groupName);
		if (hdf5->group < 0)
		{
			hdf5->group = 0;
			return(ERR);
		}
		return(OK);
	}

	int hdf5WriteSimulationModelURI(struct HDF5Data* hdf5,const char* uri)
	{
	herr_t status;
	hid_t datatype,dataspace,dataset;
	hsize_t dim[1];

	if ((hdf5 == NULL) \|\| (hdf5->intent != WRITING))
	{
	fprintf(stderr,"Attempting to write to a reading file.\n");
	return(ERR);
	}
	if (hdf5->group <= 0)
	{
	fprintf(stderr,"Missing HDF5 group.\n");
	return(ERR);
	}
	if (uri == NULL)
	{
	fprintf(stderr,"Attempting to write invalid URI to data file.\n");
	return(ERR);
	}
	if (strlen(uri) > HDF5_STRING_LENGTH-1)
	{
	fprintf(stderr,
	"URI too long - fix the hdf5WriteSimulationModelURI code.\n");
	return(ERR);
	}
	char* localURI = (char*)calloc(HDF5_STRING_LENGTH,1);
	strcpy(localURI,uri);
	/* Make a string data type */
	datatype = H5Tcopy(H5T_C_S1);
	/* set the fixed string length */
	status = H5Tset_size(datatype,HDF5_STRING_LENGTH);
	/* and we're gonna use C-like null-terminated string */
	status = H5Tset_strpad(datatype,H5T_STR_NULLTERM);
	/* Create a simple memory space of the correct size */
	dim[0] = 1;
	dataspace = H5Screate_simple(1,dim,NULL);
	/* and create the dataset to write to */
	dataset = H5Dcreate(hdf5->group,SIMULATION_MODEL_URI_NAME,datatype,
	dataspace,H5P_DEFAULT);
	/* Write the URI to the file */
	status = H5Dwrite(dataset,datatype,H5S_ALL,H5S_ALL,H5P_DEFAULT,localURI);
	/* clean up */
	H5Dclose(dataset);
	H5Sclose(dataspace);
	H5Tclose(datatype);
	/H5Fflush(hdf5->file,H5F_SCOPE_GLOBAL);/
	free(localURI);
	return(OK);
	}

	int hdf5WriteFieldHeader(struct HDF5Data* hdf5,int N,char* names)
	{
	herr_t status;
	hid_t datatype,dataspace,dataset;
	hsize_t dim[1];

	if ((hdf5 == NULL) \|\| (hdf5->intent != WRITING))
	{
	fprintf(stderr,"Attempting to write to a reading file.\n");
	return(ERR);
	}
	if (hdf5->group <= 0)
	{
	fprintf(stderr,"Missing HDF5 group.\n");
	return(ERR);
	}
	/* Make a string data type */
	datatype = H5Tcopy(H5T_C_S1);
	/* set the fixed string length */
	status = H5Tset_size(datatype,HDF5_STRING_LENGTH);
	/* and we're gonna use C-like null-terminated strings */
	status = H5Tset_strpad(datatype,H5T_STR_NULLTERM);
	/* Create a simple memory space of the correct size */
	dim[0] = N;
	dataspace = H5Screate_simple(1,dim,NULL);
	/* and create the dataset to write to */
	dataset = H5Dcreate(hdf5->group,FIELD_HEADER_DATA_NAME,datatype,
	dataspace,H5P_DEFAULT);
	/* Write the field names to the file */
	status = H5Dwrite(dataset,datatype,H5S_ALL,H5S_ALL,H5P_DEFAULT,names);
	/* clean up */
	H5Dclose(dataset);
	H5Sclose(dataspace);
	H5Tclose(datatype);
	/H5Fflush(hdf5->file,H5F_SCOPE_GLOBAL);/
	return(OK);
	}

	int hdf5WriteData(struct HDF5Data* hdf5,int N,double* data)
	{
	herr_t status = 0;

	if ((hdf5 == NULL) \|\| (hdf5->intent != WRITING))
	{
	fprintf(stderr,"Attempting to write to a reading file.\n");
	return(ERR);
	}
	if (hdf5->group <= 0)
	{
	fprintf(stderr,"Missing HDF5 group.\n");
	return(ERR);
	}
	if (!hdf5->ptCreated)
	{
	/* create a fixed length packet table in the file */
	hdf5->pt = H5PTcreate_fl(hdf5->group,DATA_NAME,H5T_NATIVE_DOUBLE,
	/chunk size ??/sizeof(double)*N);
	hdf5->ptCreated = 1;
	}
	if (hdf5->ptCreated)
	{
	/* Write a packet to the packet table */
	status = H5PTappend(hdf5->pt,N,(void*)data);
	}
	return(OK);
	}

	char* hdf5ReadSimulationModelURI(struct HDF5Data* hdf5)
	{
	herr_t status;
	hid_t datatype,dataspace,dataset;
	char uri = (char)NULL;

	if ((hdf5 == NULL) \|\| (hdf5->intent != READING))
	{
	fprintf(stderr,"Attempting to read from a writing file.\n");
	return((char*)NULL);
	}
	if (hdf5->group <= 0)
	{
	fprintf(stderr,"Missing HDF5 group.\n");
	return(ERR);
	}
	/* open the dataset */
	dataset = H5Dopen(hdf5->group,SIMULATION_MODEL_URI_NAME);
	dataspace = H5Dget_space(dataset);
	/* get the data type */
	datatype = H5Dget_type(dataset);
	/* allocate memory */
	uri = (char*)malloc(HDF5_STRING_LENGTH);
	/* read in the data */
	status = H5Dread(dataset,datatype,H5S_ALL,H5S_ALL,H5P_DEFAULT,uri);
	if (status < 0)
	{
	fprintf(stderr,"Error getting the dimension of the field header.\n");
	free(uri);
	H5Sclose(dataspace);
	H5Tclose(datatype);
	H5Dclose(dataset);
	return((char*)NULL);
	}
	/* clean up */
	H5Sclose(dataspace);
	H5Tclose(datatype);
	H5Dclose(dataset);
	return(uri);
	}

	char** hdf5ReadFieldHeader(struct HDF5Data* hdf5,int* N)
	{
	herr_t status;
	hid_t datatype,dataspace,dataset;
	hsize_t dim[1];
	char tmp,names = (char*)NULL;
	char fields = (char)NULL;
	int i;

	if ((hdf5 == NULL) \|\| (hdf5->intent != READING))
	{
	fprintf(stderr,"Attempting to read from a writing file.\n");
	return((char**)NULL);
	}
	if (hdf5->group <= 0)
	{
	fprintf(stderr,"Missing HDF5 group.\n");
	return(ERR);
	}
	/* open the dataset */
	dataset = H5Dopen(hdf5->group,FIELD_HEADER_DATA_NAME);
	dataspace = H5Dget_space(dataset);
	/* get the data type */
	datatype = H5Dget_type(dataset);
	/* get the size and allocate memory */
	status = H5Sget_simple_extent_dims(dataspace,dim,NULL);
	if (status < 0)
	{
	fprintf(stderr,"Error getting the dimension of the field header.\n");
	H5Sclose(dataspace);
	H5Tclose(datatype);
	H5Dclose(dataset);
	return((char**)NULL);
	}
	names = (char)malloc(HDF5_STRING_LENGTHdim[0]);
	/* read in the data */
	status = H5Dread(dataset,datatype,H5S_ALL,H5S_ALL,H5P_DEFAULT,names);
	if (status < 0)
	{
	fprintf(stderr,"Error getting the dimension of the field header.\n");
	free(names);
	H5Sclose(dataspace);
	H5Tclose(datatype);
	H5Dclose(dataset);
	return((char**)NULL);
	}
	/* clean up */
	H5Sclose(dataspace);
	H5Tclose(datatype);
	H5Dclose(dataset);
	/* split up the names */
	fields = (char*)malloc(sizeof(char)*dim[0]);
	tmp = names;
	for (i=0;i<dim[0];i++)
	{
	fields[i] = (char*)malloc(strlen(tmp)+1);
	strcpy(fields[i],tmp);
	tmp += HDF5_STRING_LENGTH;
	}
	free(names);
	*N = dim[0];
	return(fields);
	}

	double* hdf5ReadData(struct HDF5Data* hdf5,int N)
	{
	herr_t status = 0;
	double* data = (double)malloc(sizeof(double)N);

	if ((hdf5 == NULL) \|\| (hdf5->intent != READING))
	{
	fprintf(stderr,"Attempting to read from a writing file.\n");
	return((double*)NULL);
	}
	if (hdf5->group <= 0)
	{
	fprintf(stderr,"Missing HDF5 group.\n");
	return(ERR);
	}
	if (!hdf5->ptCreated)
	{
	/* open the packet table */
	hdf5->pt = H5PTopen(hdf5->group,DATA_NAME);
	/* and make sure we're at the start of the table */
	H5PTcreate_index(hdf5->pt);
	hdf5->ptCreated = 1;
	}
	if (hdf5->ptCreated)
	{
	/* get N packets from the packet table */
	status = H5PTget_next(hdf5->pt,N,(void*)data);
	if (status<0)
	{
	free(data);
	data = (double*)NULL;
	}
	}
	return(data);
	}

	struct SimulationList* hdf5ReadSimulations(struct HDF5Data* hdf5)
	{
	struct SimulationList* simulations = (struct SimulationList*)NULL;

	if ((hdf5 == NULL) \|\| (hdf5->intent != READING))
	{
	fprintf(stderr,"Attempting to read from a writing file.\n");
	return(simulations);
	}
	/* look for any groups which are children of the root group */
	hid_t rootGroup = H5Gopen(hdf5->file,"/");
	if (rootGroup > 0)
	{
	simulations = CreateSimulationList();
	if (H5Giterate(rootGroup,"/",NULL,rootIterator,(void*)simulations)
	!= 0) DestroySimulationList(&simulations);
	H5Gclose(rootGroup);
	}
	return(simulations);
	}

view raw biostars-127.c hosted with ❤ by GitHub

ADD COMMENT • link updated 6.9 years ago by Ram 45k • written 15.4 years ago by Abhishek Tiwari ▴ 120

Entering edit mode

Thank you Abhishek !

ADD REPLY • link 15.4 years ago by Pierre Lindenbaum 166k

Entering edit mode

In the light of 3.7 years of more experience (hmm hmm) this should be marked as the correct answer instead of the one describing netCDF. As one can easily see the APIs are quite different.

ADD REPLY • link 11.7 years ago by Michael 56k

Entering edit mode

15.4 years ago

Istvan Albert 102k

In the GeneTrack software we have used HDF to store values for each genomic base. Its main advantage over other storage systems was that it was able to return consecutive values with minimal overhead.

For example, it is extremely fast (milliseconds) at retrieving, say, 100,000 consecutive values starting at a certain index. We used the Python bindings to HDF. An added advantage of these bindings is that they return the data as numpy arrays (very fast numerical operations).

Here is the relevant code that deals with HDF only: hdf.py

The HDF schema is set up in a different module, but in the end it is simply something like:

class MySchema( IsDescription ):
    """
    Stores a triplet of float values for each index.

    Each row holds the integer index `ix` followed by the three float
    columns `wx`, `cx` and `ax`; `pos` fixes the column order.
    """
    ix = IntCol  ( pos=1 )  # index
    wx = FloatCol( pos=2 )  # value on the W (forward) strand
    cx = FloatCol( pos=3 )  # value on the C (reverse) strand
    ax = FloatCol( pos=4 )  # weighted value on the combined W + C strands

ADD COMMENT • link updated 6.9 years ago by Ram 45k • written 15.4 years ago by Istvan Albert 102k

Entering edit mode

Still not the kind of source code I'm looking for, but it is very interesting! Your project reminds me of Jan Aerts' LocusTree: http://saaientist.blogspot.com/2009/04/locustree-searching-genomic-loci.html

ADD REPLY • link updated 5.8 years ago by Ram 45k • written 15.4 years ago by Pierre Lindenbaum 166k

Entering edit mode

Interesting link. You might want to also check out the PyTables documentation. That may have some use cases that you might be interested in. You can safely ignore the python related features: http://www.pytables.org/docs/manual/

ADD REPLY • link 15.4 years ago by Istvan Albert 102k

Entering edit mode

15.4 years ago

Fernando Muñiz ▴ 100

What I do have is a netCDF-3 based Java application that I could show you. NetCDF-3 is basically the same idea as HDF, but considerably more limited: among other things, it cannot handle compound datatypes.

But here's a small test code example to toy with:

	package netCDF;

	import java.io.File;
	import ucar.ma2.*;
	import ucar.nc2.*;
	import java.io.IOException;
	import java.util.ArrayList;

	/**
	 * Defines the layout (dimensions, variables and attributes) of a netCDF-3
	 * genotype file.
	 *
	 * @author Fernando Muñiz Fernandez
	 * IBE, Institute of Evolutionary Biology (UPF-CSIC)
	 * CEXS-UPF-PRBB
	 *
	 * THIS TO CREATE THE netCDF-3 GENOTYPE FILE
	 */

	public class CreateNetcdf {

		/** Collects the given dimensions, in order, into a list for addVariable. */
		private static ArrayList<Dimension> dimsOf(Dimension... dimensions) {
			ArrayList<Dimension> list = new ArrayList<Dimension>();
			for (Dimension d : dimensions) {
				list.add(d);
			}
			return list;
		}

		/**
		 * Creates a writeable netCDF-3 file object and declares its dimensions,
		 * variables and attributes. The file itself is not written here; the
		 * caller still has to call create() on the returned object.
		 *
		 * @param studyId       stored as the global study attribute
		 * @param technology    stored as the global technology attribute;
		 *                      "INTERNAL" is used when null
		 * @param description   stored as the global description attribute; the
		 *                      previous fixed text is used when null
		 * @param strand        stored as the strand attribute of the genotypes
		 *                      variable; "+/-" is used when null
		 * @param sampleSetSize length of the "samples" dimension
		 * @param markerSetSize length of the "markers" dimension
		 * @return the defined, not yet created, file object
		 */
		public static NetcdfFileWriteable setDimsAndAttributes(Integer studyId,
				String technology,
				String description,
				String strand,
				int sampleSetSize,
				int markerSetSize) throws InvalidRangeException, IOException {

			///////////// CREATE netCDF-3 FILE ////////////
			String genotypesFolder = "/media/data/genotypes";
			File pathToStudy = new File(genotypesFolder+"/netCDF_test");
			int gtSpan = constants.cNetCDF.Strides.STRIDE_GT;
			int markerSpan = constants.cNetCDF.Strides.STRIDE_MARKER_NAME;
			int sampleSpan = constants.cNetCDF.Strides.STRIDE_SAMPLE_NAME;

			String matrixName = "prototype";
			String writeFileName = pathToStudy+"/"+matrixName+".nc";
			NetcdfFileWriteable ncfile = NetcdfFileWriteable.createNew(writeFileName, false);

			// add dimensions
			// The first three stay in this order because the test program in this
			// thread reads them back by position (0, 1, 2).
			Dimension samplesDim = ncfile.addDimension("samples", sampleSetSize);
			Dimension markersDim = ncfile.addDimension("markers", markerSetSize);
			Dimension gtSpanDim = ncfile.addDimension("span", gtSpan);

			// addVariable needs Dimension objects, so every fixed string width
			// gets its own named dimension instead of a bare Integer in the list.
			Dimension markerSpanDim = ncfile.addDimension("marker_name_span", markerSpan);
			Dimension sampleSpanDim = ncfile.addDimension("sample_name_span", sampleSpan);
			Dimension span32Dim = ncfile.addDimension("span_32", 32);
			Dimension span16Dim = ncfile.addDimension("span_16", 16);
			Dimension span8Dim = ncfile.addDimension("span_8", 8);
			Dimension span2Dim = ncfile.addDimension("span_2", 2);
			Dimension span1Dim = ncfile.addDimension("span_1", 1);

			ArrayList<Dimension> dims = dimsOf(samplesDim, markersDim, gtSpanDim);
			ArrayList<Dimension> markerGenotypeDims = dimsOf(markersDim, markerSpanDim);
			ArrayList<Dimension> markerPositionDim = dimsOf(markersDim);
			ArrayList<Dimension> markerPropertyDim32 = dimsOf(markersDim, span32Dim);
			ArrayList<Dimension> markerPropertyDim16 = dimsOf(markersDim, span16Dim);
			ArrayList<Dimension> markerPropertyDim8 = dimsOf(markersDim, span8Dim);
			ArrayList<Dimension> markerPropertyDim2 = dimsOf(markersDim, span2Dim);
			ArrayList<Dimension> markerPropertyDim1 = dimsOf(markersDim, span1Dim);
			ArrayList<Dimension> sampleSetDims = dimsOf(samplesDim, sampleSpanDim);

			// Define Marker Variables
			ncfile.addVariable("markerset", DataType.CHAR, markerGenotypeDims);
			ncfile.addVariableAttribute("markerset", constants.cNetCDF.Attributes.LENGTH, markerSetSize);

			ncfile.addVariable("marker_chromosome", DataType.CHAR, markerPropertyDim8);
			ncfile.addVariable("marker_position", DataType.CHAR, markerPropertyDim32);
			ncfile.addVariable("marker_position_int", DataType.INT, markerPositionDim);
			ncfile.addVariable("marker_strand", DataType.CHAR, markerPropertyDim8);

			ncfile.addVariable("marker_property_1", DataType.CHAR, markerPropertyDim1);
			ncfile.addVariable("marker_property_2", DataType.CHAR, markerPropertyDim2);
			ncfile.addVariable("marker_property_8", DataType.CHAR, markerPropertyDim8);
			ncfile.addVariable("marker_property_16", DataType.CHAR, markerPropertyDim16);
			ncfile.addVariable("marker_property_32", DataType.CHAR, markerPropertyDim32);

			// Define Sample Variables
			ncfile.addVariable("sampleset", DataType.CHAR, sampleSetDims);
			ncfile.addVariableAttribute("sampleset", constants.cNetCDF.Attributes.LENGTH, sampleSetSize);

			// Define Genotype Variables
			ncfile.addVariable("genotypes", DataType.CHAR, dims);
			ncfile.addVariableAttribute("genotypes", constants.cNetCDF.Attributes.GLOB_STRAND,
					strand != null ? strand : "+/-");

			// add global attributes; the arguments are used, with the old fixed
			// values as fallback when an argument is null
			ncfile.addGlobalAttribute(constants.cNetCDF.Attributes.GLOB_STUDY, studyId);
			ncfile.addGlobalAttribute(constants.cNetCDF.Attributes.GLOB_TECHNOLOGY,
					technology != null ? technology : "INTERNAL");
			ncfile.addGlobalAttribute(constants.cNetCDF.Attributes.GLOB_DESCRIPTION,
					description != null ? description : "Matrix created by MOAPI through addition of 2 matrices");

			return ncfile;

		}
	}

view raw biostars-74_1.java hosted with ❤ by GitHub

Use the above in the following way:

	package netCDF;

	import java.util.List;
	import ucar.ma2.*;
	import ucar.nc2.*;
	import java.io.IOException;

	/**
	*****
	* @author Fernando Muñiz Fernandez
	* IBE, Institute of Evolutionary Biology (UPF-CSIC)
	* CEXS-UPF-PRBB</p>
	*****
	* THIS TO GENERATE A netCDF-3 GENOTYPE DB
	*/

	public class TestWriteNetcdf {

	public static void main(String[] arg) throws InvalidRangeException, IOException {

	NetcdfFileWriteable ncfile = netCDF.CreateNetcdf.setDimsAndAttributes(0,
	"INTERNAL",
	"test in TestWriteNetcdf",
	"+/-",
	5,
	10);

	// create the file
	try {
	ncfile.create();
	} catch (IOException e) {
	System.err.println("ERROR creating file "+ncfile.getLocation()+"\n"+e);
	}


	////////////// FILL'ER UP! ////////////////
	List<Dimension>; dims = ncfile.getDimensions();
	Dimension samplesDim = dims.get(0);
	Dimension markersDim = dims.get(1);
	Dimension markerSpanDim = dims.get(2);

	ArrayChar charArray = new ArrayChar.D3(samplesDim.getLength(),markersDim.getLength(),markerSpanDim.getLength());
	int i,j;
	Index ima = charArray.getIndex();


	int method = 1;
	switch (method) {
	case 1:
	// METHOD 1: Feed the complete genotype in one go
	for (i=0; i<samplesDim.getLength(); i++) {
	for (j=0; j<markersDim.getLength(); j++) {
	char c = (char) ((char) j + 65);
	String s = Character.toString(c) + Character.toString(c);
	charArray.setString(ima.set(i,j,0),s);
	System.out.println("SNP: "+i);
	}
	}
	break;
	case 2:
	//METHOD 2: One snp at a time -> feed in all samples
	for (i=0; i<markersDim.getLength(); i++) {
	charArray.setString(ima.set(i,0), "s"+i+"I0");
	System.out.println("SNP: "+i);
	}
	break;
	case 3:
	//METHOD 3: One sample at a time -> feed in all snps
	break;
	}



	int[] offsetOrigin = new int[3]; //0,0
	try {
	ncfile.write("genotypes", offsetOrigin, charArray);
	//ncfile.write("genotype", origin, A);
	} catch (IOException e) {
	System.err.println("ERROR writing file");
	} catch (InvalidRangeException e) {
	e.printStackTrace();
	}

	// close the file
	try {
	ncfile.close();
	} catch (IOException e) {
	System.err.println("ERROR creating file "+ncfile.getLocation()+"\n"+e);
	}

	}
	}

view raw biostars-74_2.java hosted with ❤ by GitHub

ADD COMMENT • link updated 6.9 years ago by Ram 45k • written 15.4 years ago by Fernando Muñiz ▴ 100

Entering edit mode

Very interesting. I'll flag this answer as "correct", but please, if anybody knows some source code that would highlight the power of HDF5. Please, feel free to post it here. Thanks

ADD REPLY • link 15.4 years ago by Pierre Lindenbaum 166k

Entering edit mode

Unfortunately, this answer is incorrect with respect to the question.

ADD REPLY • link 11.7 years ago by Michael 56k

Entering edit mode

you're right, but 3,7 years later, I don't think HDF5 will help me :-)

ADD REPLY • link 11.7 years ago by Pierre Lindenbaum 166k

Entering edit mode

15.4 years ago

Fernando Muñiz ▴ 100

Hello Pierre!

I have been talking with the BioHDF guys and from what they tell me, their work will be centered around a number of command-line APIs, written in C, that will address some areas of usage which for now do not seem to overlap.

I have seen this example on their site: http://www.hdfgroup.org/projects/biohdf/biohdf_tools.html Don't know if that helps.

I have been talking with them to see if we can achieve an API for saving genotype data. Don't know yet where that will lead me.

If you are looking for something more versatile, you will probably have to delve in the official HDF5 C code ( http://www.hdfgroup.org/HDF5/Tutor/ ), which seems to be the only one that offers all the functionality and goodies of that impressive storage system.

ADD COMMENT • link 15.4 years ago by Fernando Muñiz ▴ 100

Entering edit mode

Many thanks Fernando. As I said , I'm especially looking for some source code: e.g. I'd like to see a short program that would store/retrieve data just to see/understand why I should use this HDF instead of a classic RDBM or another engine (berkeleydb, couchdb...)

ADD REPLY • link 15.4 years ago by Pierre Lindenbaum 166k

Entering edit mode

There are Perl-based bindings to both HDF5 and BioHDF here, along with some docs:

ftp://ftp.hdfgroup.uiuc.edu/pub/outgoing/BioHDF/Perl/

ADD REPLY • link 15.4 years ago by Chris Fields ★ 2.2k

Entering edit mode

In case you did not look at it yet: http://www.hdfgroup.org/HDF5/doc/H5.intro.html http://www.hdfgroup.org/HDF5/doc/Intro/IntroExamples.html

ADD REPLY • link 15.4 years ago by Darked89 4.7k

Entering edit mode

15.4 years ago

Michael 56k

There is also a Perl binding to HDF5: PDL::IO::HDF5

http://search.cpan.org/~cerney/PDL-IO-HDF5-0.5/ This requires the Perl Data Language (PDL) package. The way data structures can be handled, sub-ranges of data can be defined, and data can be manipulated is actually very elegant in PDL, so computational code can profit from PDL's vectorized style of writing expressions.

The same is true for R and the hdf5 package: http://cran.r-project.org/web/packages/hdf5/index.html

Code examples are in the package documentations of both, the R-hdf5 package documentation is quite little though.

Both of these language bindings might be a very efficient way to read and write HDF5 files.

There are also APIs in Fortran, Java, Python, Matlab, C, or C++. So it might make sense to select the language and define the type of data you wish to store first.

ADD COMMENT • link 15.4 years ago by Michael 56k

Entering edit mode

15.4 years ago

Chris Fields ★ 2.2k

I've been talking a bit with one of the devs behind BioHDF (being at UIUC, up the road from The HDF Group doesn't hurt). I believe a publication is on the way describing it along with some implementation details.

ADD COMMENT • link 15.4 years ago by Chris Fields ★ 2.2k

Entering edit mode

15.4 years ago

Darked89 4.7k

Not BioHDF5 but probably readable and maintained:

HDF5 for Python http://code.google.com/p/h5py/

ADD COMMENT • link 15.4 years ago by Darked89 4.7k

Entering edit mode

also pytables: http://www.pytables.org/moin

ADD REPLY • link 15.4 years ago by Istvan Albert 102k

Entering edit mode

15.4 years ago

Giovanni M Dall'Olio 28k

Unfortunately, I don't have any example to show you yet. I don't know how to program in C/C++, so I have been looking at two HDF5 wrappers for Python, PyTables and H5Py.

PyTables has a database-like approach in which HDF5 is used as a sort of hierarchical database, in which a column can be a table itself, allowing to store nested data. For example, you have a table called 'SNPs' with two columns, 'id' and 'genotypes'; the column 'genotypes' contains a nested table, with the columns 'individual' and 'genotype'; and so on.

H5Py is basically a re-implementation of numpy's arrays, so you can store and access arrays/matrixes as you would do with numpy (it is similar to arrays and matrixes in matlab, R, and any other language with this data type) and they are stored in an HDF5 file so the access is faster.

ADD COMMENT • link 15.4 years ago by Giovanni M Dall'Olio 28k

Entering edit mode

one point to clarify is that with the nesting in pytables, you only get the equivalent of 1 row in the nest. so you'd have:

row['genotypes/genotype'] = 'XXX'

so while it is nested, you dont actually get a many-to-one relationship. still, i find pytables to be excellent for many things.

ADD REPLY • link 14.7 years ago by brentp 24k

Entering edit mode

15.4 years ago

Michael Hoffman ▴ 330

Our Genomedata system stores multiple tracks of 1-bp resolution genomic data in a HDF5 array. Documentation and full source code is available on that page. It has a Python (PyTables) interface for reading the data. For originally loading it into HDF5, we wrote a C loader for added speed.

ADD COMMENT • link 15.4 years ago by Michael Hoffman ▴ 330

Entering edit mode

thanks, that's very useful too

ADD REPLY • link 15.4 years ago by Pierre Lindenbaum 166k

Entering edit mode

14.7 years ago

Pierre Lindenbaum 166k

Another answer: I've been looking at the IGV (Integrative Genomics Viewer) sources; it uses the Java bindings for HDF5 to store the data about the genomic tracks.

ADD COMMENT • link 14.7 years ago by Pierre Lindenbaum 166k