Hello I am new to GATK and I am attempting to follow the best practices for germline SNPs and Indels in Whole Exome sequencing. Here is my code:
import os, pprint
path = "/Users/seyfim/Directory/Marilyn/Eng_lamis Exom Hiseq Run 5-10-17/FASTQ Files/Eng_lamis-exome-38605575/CCF00239-01-001-46873888"
all=[ i for i in os.listdir(path) if i.endswith('.gz')]
for f in all:
print '\n\n\n\tsai making on ',f
os.system('bwa aln -t 32 -f '+f[:-9]+'.sai /Users/seyfim/Directory/Farshad/NEW_LIBRARY_FILES/hg19 '+f)
print '\n\n\tsai making has finished for ',f
all=[ i for i in os.listdir('.') if i.endswith('.sai')]
all.sort()
pprint.pprint(all)
while all:
first=all.pop(0)
second=all.pop(0)
name=first[5:-7]
print 'working on\n\n\t',first,'\n\t',second
os.system('bwa sampe -f '+name+'.sam -r "@RG\tID:0\tLB:'+name+'_LIB\tSM:'+name+'\tPL:ILLUMINA" /Users/seyfim/Directory/Farshad/NEW_LIBRARY_FILES/hg19 '+first+' '+second+' '+first[:-4]+'.fastq.gz '+second[:-4]+'.fastq.gz')
print 'Done working on ',name
If MeM:
path = "/Users/seyfim/Directory/Marilyn/Eng_lamis Exom Hiseq Run 5-10-17/FASTQ Files/Eng_lamis-exome-38605575/CCF00239-01-001-46873888"
all=[i for i in os.listdir(path) if i.endswith('gz')]
all.sort()
pprint.pprint(all)
while all:
fq1=all.pop(0)
fq2=all.pop(0)
f=fq1[:-9]
print 'working on\n\n\t',fq1,'\n\t',fq2,'\n\tAs ----> ',f
os.system('/Users/seyfim/software/bwa/bwa mem -M -t 24 -R "@RG\tID:bwa\tLB:'+f+'_LIB\tSM:'+f+'\tPL:ILLUMINA" /Users/seyfim/Directory/Farshad/NEW_LIBRARY_FILES/hg19.fa '+fq1+' '+fq2+' > '+f+'.sam')
I get this error:
Error details: SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups
I thought I had taken care of this with
os.system('bwa sampe -f '+name+'.sam -r "@RG\tID:0\tLB:'+name+'_LIB\tSM:'+name+'\tPL:ILLUMINA"