Hi I am currently following the tutorial of DEXSeq to do a differential Exon usage analysis on my dataset. I am currently at the step where I read my dataset in to R to construct a DEXSeqDataSet using the function DEXSeqDataSetFromHTSeq()
.
dxd = DEXSeqDataSetFromHTSeq(
countFiles,
sampleData=sampleTable,
design= ~ sample + exon + condition:exon,
flattenedfile=flattenedFile )
I have total 4 countFiles which I constructed from my actual dataset to apply this function on a smaller dataset. The countFiles look like following:
RNA1_test.txt:
""ENSG00000000003.14"":"001" 36
""ENSG00000000003.14"":"002" 183
""ENSG00000000003.14"":"003" 45
""ENSG00000000003.14"":"004" 0
""ENSG00000000003.14"":"005" 32
""ENSG00000000003.14"":"006" 21
""ENSG00000000003.14"":"007" 22
""ENSG00000000003.14"":"008" 24
""ENSG00000000003.14"":"009" 26
""ENSG00000000003.14"":"010" 17
""ENSG00000000003.14"":"011" 34
""ENSG00000000003.14"":"012" 10
""ENSG00000000003.14"":"013" 12
""ENSG00000000003.14"":"014" 7
""ENSG00000000003.14"":"015" 2
""ENSG00000000003.14"":"016" 0
""ENSG00000000003.14"":"017" 0
RNA2_test.txt:
""ENSG00000000003.14"":"001" 34
""ENSG00000000003.14"":"002" 191
""ENSG00000000003.14"":"003" 60
""ENSG00000000003.14"":"004" 0
""ENSG00000000003.14"":"005" 41
""ENSG00000000003.14"":"006" 36
""ENSG00000000003.14"":"007" 42
""ENSG00000000003.14"":"008" 50
""ENSG00000000003.14"":"009" 47
""ENSG00000000003.14"":"010" 26
""ENSG00000000003.14"":"011" 38
""ENSG00000000003.14"":"012" 5
""ENSG00000000003.14"":"013" 18
""ENSG00000000003.14"":"014" 7
""ENSG00000000003.14"":"015" 1
""ENSG00000000003.14"":"016" 1
""ENSG00000000003.14"":"017" 0
etc...
The amount of lines the transcripts/Exons in the the 4 countFiles are identical only the counts are different. I also used a sample of the actual flattened gff file that was used for the counting the reads :
chrX dexseq_prepare_annotation.py aggregate_gene 100627109 100639991 . - . gene_id ""ENSG00000000003.14""
chrX dexseq_prepare_annotation.py exonic_part 100627109 100628669 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000612152.4""; exonic_part_number "001"
chrX dexseq_prepare_annotation.py exonic_part 100628670 100629986 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000373020.8"+"ENST00000612152.4""; exonic_part_number "002"
chrX dexseq_prepare_annotation.py exonic_part 100630759 100630866 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000373020.8"+"ENST00000612152.4""; exonic_part_number "003"
chrX dexseq_prepare_annotation.py exonic_part 100632063 100632068 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000614008.4""; exonic_part_number "004"
chrX dexseq_prepare_annotation.py exonic_part 100632485 100632540 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000373020.8"+"ENST00000614008.4""; exonic_part_number "005"
chrX dexseq_prepare_annotation.py exonic_part 100632541 100632568 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000373020.8"+"ENST00000496771.5"+"ENST00000614008.4""; exonic_part_number "006"
chrX dexseq_prepare_annotation.py exonic_part 100633405 100633441 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000373020.8"+"ENST00000496771.5"+"ENST00000614008.4"+"ENST00000612152.4""; exonic_part_number "007"
chrX dexseq_prepare_annotation.py exonic_part 100633442 100633539 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000614008.4"+"ENST00000612152.4"+"ENST00000494424.1"+"ENST00000496771.5"+"ENST00000373020.8""; exonic_part_number "008"
chrX dexseq_prepare_annotation.py exonic_part 100633931 100634029 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000614008.4"+"ENST00000612152.4"+"ENST00000494424.1"+"ENST00000496771.5"+"ENST00000373020.8""; exonic_part_number "009"
chrX dexseq_prepare_annotation.py exonic_part 100635178 100635252 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000614008.4"+"ENST00000612152.4"+"ENST00000494424.1"+"ENST00000496771.5"+"ENST00000373020.8""; exonic_part_number "010"
chrX dexseq_prepare_annotation.py exonic_part 100635558 100635746 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000614008.4"+"ENST00000612152.4"+"ENST00000494424.1"+"ENST00000496771.5"+"ENST00000373020.8""; exonic_part_number "011"
chrX dexseq_prepare_annotation.py exonic_part 100636191 100636607 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000496771.5""; exonic_part_number "012"
chrX dexseq_prepare_annotation.py exonic_part 100636608 100636689 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000373020.8"+"ENST00000496771.5""; exonic_part_number "013"
chrX dexseq_prepare_annotation.py exonic_part 100636690 100636792 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000373020.8""; exonic_part_number "014"
chrX dexseq_prepare_annotation.py exonic_part 100636793 100636806 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000373020.8"+"ENST00000614008.4"+"ENST00000494424.1"+"ENST00000612152.4""; exonic_part_number "015"
chrX dexseq_prepare_annotation.py exonic_part 100636807 100637104 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000614008.4"+"ENST00000494424.1"+"ENST00000612152.4""; exonic_part_number "016"
chrX dexseq_prepare_annotation.py exonic_part 100639945 100639991 . - . gene_id ""ENSG00000000003.14""; transcripts ""ENST00000494424.1""; exonic_part_number "017"
However when I run my code :
#get count file names
countFiles= list.files(path = "Desktop",pattern = "RNA", full.names = TRUE)
basename(countFiles)
[1] "RNA1_test.txt" "RNA2_test.txt" "RNA3_test.txt" "RNA4_test.txt"
#get gff file name
flattenedFile = list.files(path = "Desktop", pattern="gff$", full.names=TRUE)
basename(flattenedFile)
[1] "dexseq_test.gff"
sampleTable = data.frame(
row.names = c( "retina1", "retina2",
"brain1", "brain2" ),
condition = c("experiment", "experiment",
"control", "control" ))
dxd = DEXSeqDataSetFromHTSeq(
countFiles,
sampleData=sampleTable2,
design= ~ sample + exon + condition:exon,
flattenedfile=flattenedFile )
I receive the following error:
Error in FUN(X[[i]], ...) : subscript out of bound
I am not sure what might cause this error, since the gene ids in the gff file also match the ones used in my countFiles. Anyone familiar with this error?
One of the differences I noticed between my files and the files used in the example are the quotes " " in the countFiles -> ""ENSG00000000003.14"":"001" 34 , but im not sure if that could cause it