I don't know if this will help you, but we wrote these scripts a few years ago to 1) extract data and notes from series_matrix files, and 2) transform the generated .notes file into a phenoData file.
Usage (you have to install the PerlIO::gzip module first):
extractData.pl GSExxxx_series_matrix.txt.gz output_dir
This will write a GSExxxx.data file and a GSExxxx.notes file into 'output_dir'. The GSExxxx.data file is easily imported into R with read.table(); the column names are the GSM values.
To get a phenoData file from the .notes file, do :
notes2pData.pl GSExxxx.notes
hope it still works !
Julien
extractData.pl:
#!/usr/bin/perl -w
use strict;
use warnings;
use PerlIO::gzip;
# ---------------------------------------------------------------------------
# extractData.pl -- main body
#
# Usage: extractData.pl GSExxxx_series_matrix.txt.gz output_dir
#
# Reads a gzip-compressed series-matrix file (first argument) and writes two
# files into the output directory (second argument):
#   <prefix>.data   the lines found between the "series_matrix_table_begin"
#                   and "series_matrix_table_end" markers, with every '"' and
#                   '#' character removed;
#   <prefix>.notes  "KEY = value" lines built from the "!Series_..." and
#                   "!Sample_..." header lines.
# <prefix> is "GSEnnn", or "GSEnnn-GPLnnn" when the input file name carries a
# GPL accession.
#
# Dies with a message when arguments are missing, the file name does not look
# like a series-matrix file, or a file cannot be opened / written.
# ---------------------------------------------------------------------------
die "Usage: $0 GSExxxx_series_matrix.txt.gz output_dir\n" if @ARGV < 2;
my ( $inputFile, $outputDir ) = @ARGV;

# The accessions are taken from the file name, not from the file content.
# The list assignment is evaluated in scalar context by "or", so it is false
# only when the pattern did not match at all (an absent GPL is still fine).
my ( $GSE, $GPL ) =
    $inputFile =~ /(GSE\d+)-?(GPL\d+)?_series_matrix\.txt\.gz$/
    or die "Cannot find a GSE accession in file name '$inputFile' "
         . "(expected GSExxxx[-GPLxxxx]_series_matrix.txt.gz)\n";

my $prefix = defined($GPL) ? "$GSE-$GPL" : "$GSE";

open( my $in, "<:gzip", $inputFile )
    or die "Cannot open '$inputFile' for reading: $!\n";

# Report the output prefix on STDOUT (no trailing newline, as before).
print $prefix;

open( my $matrix, '>', "$outputDir/$prefix.data" )
    or die "Cannot open '$outputDir/$prefix.data' for writing: $!\n";
open( my $notes, '>', "$outputDir/$prefix.notes" )
    or die "Cannot open '$outputDir/$prefix.notes' for writing: $!\n";

# Scalars start as empty strings so that a header missing from the input
# produces an empty value in the notes file instead of an "uninitialized"
# warning.
my $gseTitle = '';
my $gsePMID  = '';
my ( @summaryParts, @platformIDs );
my ( @sampleIDs, @sampleTitles, @sampleDescriptions, @sampleSrcCh1,
     @samplePlatforms, @sampleOrganism );

my $inTable = 0;
while ( my $line = <$in> ) {
    # The end marker must be tested before the copy step so that the marker
    # line itself is not written to the data file.
    if ( $line =~ /^![sS]eries_matrix_table_end/ ) {
        $inTable = 0;
    }
    if ($inTable) {
        # Table rows are copied with their original line ending.
        $line =~ s/["#]//g;
        print {$matrix} $line;
        next;
    }

    $line =~ s/[\r\n]//g;

    if ( $line =~ /^!Series_title\s+"(.+)"/ ) {
        $gseTitle = $1;
    }
    elsif ( $line =~ /^!Series_summary\s+"(.+)"/ ) {
        # The summary may span several header lines; they are joined with a
        # space when the notes file is written.
        push @summaryParts, $1;
    }
    elsif ( $line =~ /^!Series_pubmed_id\s+"(.+)"/ ) {
        $gsePMID = $1;
    }
    elsif ( $line =~ /^!Series_platform_id\s+"(.+)"/ ) {
        push @platformIDs, $1;
    }
    elsif ( $line =~ /^!Sample_geo_accession/ ) {
        @sampleIDs = _sampleRowValues($line);
    }
    elsif ( $line =~ /^!Sample_title/ ) {
        @sampleTitles = _sampleRowValues($line);
    }
    elsif ( $line =~ /^!Sample_source_name_ch1/ ) {
        @sampleSrcCh1 = _sampleRowValues($line);
    }
    elsif ( $line =~ /^!Sample_organism_ch1/ ) {
        @sampleOrganism = _sampleRowValues($line);
    }
    elsif ( $line =~ /^!Sample_description/ ) {
        # Several description lines may exist; each sample accumulates its
        # own values, separated (and terminated) by a space.
        my @values = _sampleRowValues($line);
        for my $i ( 0 .. $#values ) {
            $sampleDescriptions[$i] .= $values[$i] . " ";
        }
    }
    elsif ( $line =~ /^!Sample_platform_id/ ) {
        @samplePlatforms = _sampleRowValues($line);
    }
    elsif ( $line =~ /^![sS]eries_matrix_table_begin/ ) {
        # Copying starts with the line that follows the begin marker.
        $inTable = 1;
    }
}
close($in) or warn "Problem while closing '$inputFile': $!\n";

# Platform: the GPL from the file name wins, then the first sample platform,
# then the first series platform, else empty.
my $platform = defined($GPL)    ? $GPL
             : @samplePlatforms ? $samplePlatforms[0]
             : @platformIDs     ? $platformIDs[0]
             :                    '';

print {$notes} "GSE_ID = $GSE\n",
               "GSE_TITLE = $gseTitle\n",
               "GSE_DESC = " . join( " ", @summaryParts ) . "\n",
               "GSE_PMID = $gsePMID\n",
               "PLATFORM = $platform\n",
               "NB_SAMPLES = " . scalar(@sampleIDs) . "\n",
               "\n",
               "SAMPLE_IDS = " . join( "\t", @sampleIDs ) . "\n",
               "SAMPLE_TITLES = " . join( "\t", @sampleTitles ) . "\n",
               "SAMPLE_ORGANISMS = " . join( "\t", @sampleOrganism ) . "\n",
               "SAMPLE_SRC_CH1 = " . join( "\t", @sampleSrcCh1 ) . "\n",
               "SAMPLE_DESC = " . join( "\t", @sampleDescriptions ) . "\n";

# Buffered write errors only surface at close time, so check both.
close($matrix) or die "Error writing '$outputDir/$prefix.data': $!\n";
close($notes)  or die "Error writing '$outputDir/$prefix.notes': $!\n";

# Split one tab-separated "!Sample_..." header line into its per-sample
# values: double quotes are removed and the leading tag field is dropped.
# Takes the (already chomped) line, returns the list of values.
sub _sampleRowValues {
    my ($row) = @_;
    $row =~ s/"//g;
    my @fields = split( /\t/, $row );
    shift @fields;
    return @fields;
}