I have tried to do this for my course, and every single time I found that matching metadata to SRR numbers was incredibly convoluted
and lacked any kind of support for automated processing; sometimes you have to guess by decoding weird acronyms in the files.
To see all the metadata, and the mind-boggling redundancy and lack of modeling that is out there, run:
pip install bio
then
bio search SRR14575325 --all
then watch and weep as you see the enormous number of useful and useless fields, none of which are properly filled in:
[
{
"age": "",
"aligned": "",
"altitude": "",
"assembly_quality": "",
"assembly_software": "",
"bam_aspera": "",
"bam_bytes": "",
"bam_ftp": "",
"bam_galaxy": "",
"bam_md5": "",
"base_count": "971964750",
"binning_software": "",
"bio_material": "",
"bisulfite_protocol": "",
"broad_scale_environmental_context": "",
"broker_name": "",
"cage_protocol": "",
"cell_line": "",
"cell_type": "",
"center_name": "GEO",
"checklist": "",
"chip_ab_provider": "",
"chip_protocol": "",
"chip_target": "",
"collected_by": "",
"collection_date": "",
"collection_date_end": "",
"collection_date_start": "",
"completeness_score": "",
"contamination_score": "",
"control_experiment": "",
"country": "",
"cultivar": "",
"culture_collection": "",
"datahub": "",
"depth": "",
"description": "Illumina HiSeq 2000 sequencing: GSM5320434: TG3_1 Homo sapiens miRNA-Seq",
"dev_stage": "",
"disease": "",
"dnase_protocol": "",
"ecotype": "",
"elevation": "",
"environment_biome": "",
"environment_feature": "",
"environment_material": "",
"environmental_medium": "",
"environmental_sample": "",
"experiment_accession": "SRX10918471",
"experiment_alias": "GSM5320434",
"experiment_target": "",
"experiment_title": "Illumina HiSeq 2000 sequencing: GSM5320434: TG3_1 Homo sapiens miRNA-Seq",
"experimental_factor": "",
"experimental_protocol": "",
"extraction_protocol": "",
"faang_library_selection": "",
"fastq_aspera": "fasp.sra.ebi.ac.uk:/vol1/fastq/SRR145/025/SRR14575325/SRR14575325.fastq.gz",
"fastq_bytes": "612817621",
"fastq_galaxy": "ftp.sra.ebi.ac.uk/vol1/fastq/SRR145/025/SRR14575325/SRR14575325.fastq.gz",
"fastq_md5": "4a6120e81b28ef2552dbeb7027f932fb",
"file_location": "",
"first_created": "2021-05-20",
"first_public": "2021-05-20",
"germline": "",
"hi_c_protocol": "",
"host": "",
"host_body_site": "",
"host_genotype": "",
"host_gravidity": "",
"host_growth_conditions": "",
"host_phenotype": "",
"host_scientific_name": "",
"host_sex": "",
"host_status": "",
"host_tax_id": "",
"identified_by": "",
"instrument_model": "Illumina HiSeq 2000",
"instrument_platform": "ILLUMINA",
"investigation_type": "",
"isolate": "",
"isolation_source": "",
"last_updated": "2021-05-20",
"lat": "",
"library_construction_protocol": "Liver tissues were removed, flash frozen on dry ice, and RNA was harvested using Trizol reagent. Illumina TruSeq RNA Sample Prep Kit (Cat#FC-122-1001) was used with 1 ug of total RNA for the construction of sequencing libraries. RNA libraries were prepared for sequencing using standard Illumina protocols",
"library_gen_protocol": "",
"library_layout": "SINGLE",
"library_max_fragment_size": "",
"library_min_fragment_size": "",
"library_name": "",
"library_pcr_isolation_protocol": "",
"library_prep_date": "",
"library_prep_date_format": "",
"library_prep_latitude": "",
"library_prep_location": "",
"library_prep_longitude": "",
"library_selection": "size fractionation",
"library_source": "TRANSCRIPTOMIC",
"library_strategy": "miRNA-Seq",
"local_environmental_context": "",
"location": "",
"location_end": "",
"location_start": "",
"lon": "",
"marine_region": "",
"mating_type": "",
"ncbi_reporting_standard": "Generic",
"nominal_length": "",
"nominal_sdev": "",
"pcr_isolation_protocol": "",
"ph": "",
"project_name": "Identification of 5'isomiR in HCC patients.",
"protocol_label": "",
"read_count": "19439295",
"read_strand": "",
"restriction_enzyme": "",
"restriction_enzyme_target_sequence": "",
"restriction_site": "",
"rna_integrity_num": "",
"rna_prep_3_protocol": "",
"rna_prep_5_protocol": "",
"rna_purity_230_ratio": "",
"rna_purity_280_ratio": "",
"rt_prep_protocol": "",
"run_accession": "SRR14575325",
"run_alias": "GSM5320434_r1",
"run_date": "",
"salinity": "",
"sample_accession": "SAMN19241174",
"sample_alias": "GSM5320434",
"sample_capture_status": "",
"sample_collection": "",
"sample_description": "TG3_1",
"sample_material": "",
"sample_prep_interval": "",
"sample_prep_interval_units": "",
"sample_storage": "",
"sample_storage_processing": "",
"sample_title": "TG3_1",
"sampling_campaign": "",
"sampling_platform": "",
"sampling_site": "",
"scientific_name": "Homo sapiens",
"secondary_project": "",
"secondary_sample_accession": "SRS9008346",
"secondary_study_accession": "SRP320296",
"sequencing_date": "",
"sequencing_date_format": "",
"sequencing_location": "",
"sequencing_longitude": "",
"sequencing_method": "",
"sequencing_primer_catalog": "",
"sequencing_primer_lot": "",
"sequencing_primer_provider": "",
"serotype": "",
"serovar": "",
"sex": "",
"specimen_voucher": "",
"sra_aspera": "fasp.sra.ebi.ac.uk:/vol1/srr/SRR145/025/SRR14575325",
"sra_bytes": "604854335",
"sra_ftp": "ftp.sra.ebi.ac.uk/vol1/srr/SRR145/025/SRR14575325",
"sra_galaxy": "ftp.sra.ebi.ac.uk/vol1/srr/SRR145/025/SRR14575325",
"sra_md5": "137a7ea70991c3b85f4e5e1aa3d3ac91",
"status": "public",
"strain": "",
"study_accession": "PRJNA730731",
"study_alias": "GSE174608",
"study_title": "Identification of 5'isomiR in HCC patients.",
"sub_species": "",
"sub_strain": "",
"submission_accession": "SRA1233621",
"submission_tool": "",
"submitted_aspera": "",
"submitted_bytes": "",
"submitted_format": "",
"submitted_ftp": "",
"submitted_galaxy": "",
"submitted_host_sex": "",
"submitted_md5": "",
"submitted_read_type": "",
"tag": "",
"target_gene": "",
"tax_id": "9606",
"taxonomic_classification": "",
"taxonomic_identity_marker": "",
"temperature": "",
"tissue_lib": "",
"tissue_type": "",
"transposase_protocol": "",
"variety": "",
"fastq_url": [
"https://ftp.sra.ebi.ac.uk/vol1/fastq/SRR145/025/SRR14575325/SRR14575325.fastq.gz"
],
"info": "613 MB files; 19.4 million reads; 972.0 million sequenced bases"
}
]
Ingenuity Pathway Analysis (Qiagen) has probably done this already with the datasets they curate in the tool called "Analysis Match". It is not a free option, but if you already have access to IPA, it should cover a large portion of human data (which I assume is your primary interest).
Thanks - I would need to obtain all of it, which I don't think they would sell me. I located some promising angles in the meantime, including a GPT that will mine PDFs as well as links and attachments.
I won't tackle this in earnest for a couple of weeks or months, but I will update this or post it as a tutorial when I do.
VAL