#!/usr/bin/perl use strict; use warnings; # gbf2tbl.pl # NCBI is supplying this Perl script for the convenience of GenBank # submitters who record their genome annotation in a format similar to # GenBank or EMBL reports. The gbf2tbl.pl parser allows some variation from # the official strict flat file conventions. Additional idiosyncrasies in # format can be supported by modification of the script. Any suggestions for # improvements or corrections are welcome. However, given the number of # possible variations of local data files and conditions, NCBI cannot # promise to implement them all and cannot be expected to troubleshoot local # installations. This script is being supplied as is, but you are always # welcome to make any changes or improvements yourself. # # The script produces annotated FASTA and 5-column feature table files, # which are then read by tbl2asn to complete the submission preparation # process. Submitters are expected to provide data that meet the annotation # criteria for acceptance by GenBank. This includes annotating coding # regions and structural RNAs, providing a gene feature with a unique # /locus_tag qualifier for each, and confirming that all CDS features # translate into proteins without error. # # For convenience, tbl2asn also runs the NCBI sequence record validator, # which checks the resulting record for many kinds of errors and # inconsistencies. Even syntactically correct records will be rejected if # they do not contain information required by GenBank policy. Examining the # validator output from tbl2asn and correcting the underlying data is # critical to ensure that submissions are acceptable. # # Existing examples from several annotation pipeline outputs have been used # to build the parser and have it accommodate information as presented. # Additional conversions are described below: # # If the ACCESSION is blank or "unknown", the sequence identifier is taken # from the LOCUS line. # # If the source feature is missing, the taxonomic name is taken from the # ORGANISM line. # # Feature keys or qualifiers that are not present in the INSDC feature table # document are reported in a .err file, and are not added to the 5-column # feature table. See http://www.insdc.org/documents/feature_table.html for # details. # # A few non-standard qualifiers are allowed and are converted to the # appropriate field in the output file. These include codon_recognized, # go_component, and region_name. # # Feature intervals that refer to 'far' locations, i.e., those not within # the cited record and which have an accession and colon, are suppressed. # Those rare features (e.g., trans-splicing between molecules) must be # annotated later using Sequin. # # The genetic code in the FASTA definition line, necessary for proper # translation of protein coding regions, is taken from a CDS /transl_table # qualifier. # # A companion script, tblfix.pl, can perform various data conversions on the # resulting feature table files, with the conversion function specified by a # command line argument. # Script to convert pseudo-GenBank or pseudo-EMBL files to FASTA and # 5-column feature table files suitable for submission to NCBI using # the tbl2asn or Sequin programs. my $gbffile = shift or die "Must supply input filename\n"; open (my $GBF_IN, $gbffile) or die "Unable to open $gbffile\n"; my $base = $gbffile; if ($base =~ /^(.+)\.gbf$/ || $base =~ /^(.+)\.gbk$/ || $base =~ /^(.+)\.gb$/ || $base =~ /^(.+)\.embl$/ || $base =~ /^(.+)\.emb$/ || $base =~ /^(.+)\.eb$/ || $base =~ /^(.+)\.art$/) { $base = $1; } open (my $FSA_OUT, ">$base.fsa") or die "Unable to open sequence output file\n"; open (my $TBL_OUT, ">$base.tbl") or die "Unable to open feature table output file\n"; open (my $ERR_OUT, ">$base.err") or die "Unable to open error output file\n"; # define global variables my $has_errors = 0; my $line_number = 0; # state variables for tracking current position in flatfile my $in_seq; my $in_feat; my $in_key; my $in_qual; my $current_key; my $current_loc; my $current_qual; my $current_val; my $organism; my $topology; my $is_source; my $is_translation; my $transl_table; my $thisline; my $curr_seq; my $locus; my $accn; my $is_order; my $organism_ok; my $printed_heading; # subroutine to clear state variables for each flatfile # start in in_feat state to gracefully handle missing FEATURES/FH line sub clearflags { $in_seq = 0; $in_feat = 1; $in_key = 0; $in_qual = 0; $current_key = ""; $current_loc = ""; $current_qual = ""; $current_val = ""; $organism = ""; $topology = ""; $is_source = 0; $is_translation = 0; $transl_table = 1; $thisline = ""; $curr_seq = ""; $locus = ""; $accn = ""; $is_order = 0; $organism_ok = 0; $printed_heading = 0; } # hashes for confirming legal feature keys and legal qualifier names my %legal_keys = (); my %legal_quals = (); sub createlists { my @keys = qw/3clip 3UTR 5clip 5UTR 10_signal35_signal allele attenuator C_region CAAT_signal CDS centromere conflict D_loop D_segment enhancer exon gap GC_signal gene iDNA intron J_segment LTR mat_peptide misc_binding misc_difference misc_feature misc_recomb misc_RNA misc_signal misc_structure mobile_element modified_base mRNA mutation N_region ncRNA old_sequence operon oriT polyA_signal polyA_site precursor_RNA preprotein preRNA prim_transcript primer_bind promoter protein_bind RBS rep_origin repeat_region repeat_unit rRNA S_region satellite scRNA sig_peptide site_ref snoRNA snRNA source stem_loop STS TATA_signal telomere terminator tmRNA transit_peptide tRNA unsure V_region V_segment variation virion/; foreach my $thiskey (@keys) { $legal_keys{lc($thiskey)} = 1; } my @quals = qw/allele anticodon artificial_location bio_material bound_moiety cell_line cell_type chloroplast chromoplast chromosome citation clone_lib clone codon_start codon collected_by collection_date compare cons_splice country cultivar culture_collection cyanelle db_xref dev_stage direction EC_number ecotype environmental_sample estimated_length evidence exception experiment focus frequency function gap_type gdb_xref gene_synonym gene germline haplogroup haplotype identified_by inference insertion_seq isolate isolation_source kinetoplast lab_host label lat_lon linkage_evidence locus_tag macronuclear map mating_type metagenomic mitochondrion mobile_element_type mobile_element mod_base mol_type ncRNA_class note number old_locus_tag operon organelle organism partial PCR_conditions PCR_primers phenotype plasmid pop_variant product protein_id proviral pseudo pseudogene rearranged replace ribosomal_slippage rpt_family rpt_type rpt_unit_range rpt_unit_seq rpt_unit satellite segment sequenced_mol serotype serovar sex specific_host specimen_voucher standard_name strain sub_clone sub_species sub_strain tag_peptide tissue_lib tissue_type trans_splicing transcript_id transgenic transl_except transl_table translation transposon UniProtKB_evidence usedin variety virion go_component go_function go_process codon_recognized bond_type gene_desc gene_syn prot_desc prot_note region_name site_type/; foreach my $thisqual (@quals) { $legal_quals{lc($thisqual)} = 1; } } # recursive subroutine for parsing flatfile representation of feature location sub parseloc { my $subloc = shift (@_); my @working = (); if ($subloc =~ /^join\((.+)\)$/) { my $temploc = $1; my @items = split (',', $temploc); foreach my $thisloc (@items) { if ($thisloc !~ /^.*:.*$/) { push (@working, parseloc ($thisloc)); } } } elsif ($subloc =~ /^order\((.+)\)$/) { $is_order = 1; my $temploc = $1; my @items = split (',', $temploc); foreach my $thisloc (@items) { if ($thisloc !~ /^.*:.*$/) { push (@working, parseloc ($thisloc)); } } } elsif ($subloc =~ /^complement\((.+)\)$/) { my $comploc = $1; my @items = parseloc ($comploc); my @rev = reverse (@items); foreach my $thisloc (@rev) { if ($thisloc =~ /^([^.]+)\.\.([^.]+)$/) { $thisloc = "$2..$1"; } if ($thisloc =~ /^>([^.]+)\.\.([^.]+)$/) { $thisloc = "<$1..$2"; } if ($thisloc =~ /^([^.]+)\.\.<([^.]+)$/) { $thisloc = "$1..>$2"; } if ($thisloc !~ /^.*:.*$/) { push (@working, parseloc ($thisloc)); } } } elsif ($subloc !~ /^.*:.*$/) { push (@working, $subloc); } return @working; } #subroutine to print next feature key / location / qualifier line sub flushline { if ($printed_heading == 0) { if ($accn eq "" || $accn =~ /^unknown.*/) { $accn = $locus; } # report identifier print $FSA_OUT ">$accn"; print $TBL_OUT ">Feature $accn\n"; $printed_heading = 1; } if ($in_key == 1) { if (! $legal_keys{lc($current_key)}) { print $ERR_OUT "Bad feature\tLine $line_number\t$current_key\n"; $has_errors = 1; } elsif ($is_source == 0) { # parse join() order() complement() ###..### location $is_order = 0; my @theloc = parseloc ($current_loc); # convert number (dot) (dot) number to number (tab) number foreach my $thisloc (@theloc) { if ($thisloc =~ /^([^.]+)\.\.([^.]+)$/) { $thisloc = "$1\t$2"; } elsif ($thisloc =~ /^(.+)\^(.+)$/) { $thisloc = "$1\^\t$2"; } elsif ($thisloc =~ /^([^.]+)$/) { $thisloc = "$1\t$1"; } } #print feature key and intervals my $first = shift (@theloc); print $TBL_OUT "$first\t$current_key\n"; foreach my $thisloc (@theloc) { print $TBL_OUT "$thisloc\n"; } if ($is_order == 1) { # generate order qualifier to force use of order instead of join print $TBL_OUT "\t\t\torder\n"; } } } elsif ($in_qual == 1) { if (! $legal_quals{lc($current_qual)}) { print $ERR_OUT "Bad qualifier\tLine $line_number\t$current_qual\n"; $has_errors = 1; } elsif (! $legal_keys{lc($current_key)}) { $has_errors = 1; } elsif ($is_source == 1) { if ($current_val eq "") { print $FSA_OUT " [$current_qual=]"; } else { print $FSA_OUT " [$current_qual=$current_val]"; if ($current_qual eq "organism") { $organism = ""; $organism_ok = 1; } } } elsif ($current_qual ne "translation") { if ($current_val eq "") { print $TBL_OUT "\t\t\t$current_qual\n"; } else { print $TBL_OUT "\t\t\t$current_qual\t$current_val\n"; } } } } # initialize flags and lists at start of program clearflags (); createlists (); # main loop reads one line at a time while ($thisline = <$GBF_IN>) { $thisline =~ s/\r//; $thisline =~ s/\n//; $line_number++; if ($thisline =~ /^LOCUS\s+(\S*).*$/ || $thisline =~ /^ID\s+(\S*);.*$/ || $thisline =~ /^ID\s+(\S*)\s+SV\s+\d+;.*$/) { # record locus $locus = $1; if ($thisline =~ /^.*(linear).*$/ || $thisline =~ /^.*(circular).*$/) { $topology = $1; } } elsif ($thisline =~ /^ACCESSION\s*(\S*).*$/ || $thisline =~ /^AC\s*(.*);.*$/) { # record accession $accn = $1; } elsif ($thisline =~ /^ {1,3}ORGANISM\s+(.*)$/ || $thisline =~ /^OS\s+(.*)$/) { # record organism $organism = $1; if ($organism =~ /^([^(]*) \(.*\)/) { $organism = $1; } } elsif ($thisline =~ /^FEATURES\s+.*$/ || $thisline =~ /^FH\s+.*$/) { # beginning of feature table, flags already set up } elsif ($thisline =~ /^ORIGIN\s*.*$/ || $thisline =~ /^SQ\s*.*$/) { # end of feature table, print final newline flushline (); if ($in_seq == 0) { if ($organism ne "") { print $FSA_OUT " [organism=$organism]"; $organism_ok = 1; } if ($topology ne "") { print $FSA_OUT " [topology=$topology]"; } if ($transl_table > 1) { print $FSA_OUT " [gcode=$transl_table]"; } print $FSA_OUT "\n"; } $in_feat = 0; $in_key = 0; $in_qual = 0; $is_source = 0; $is_translation = 0; $in_seq = 1; } elsif ($thisline =~ /^\/\/\.*/) { if ($organism_ok == 0) { print $ERR_OUT "ERROR - No organism found\n"; } # at end-of-record double slash, reset variables for catenated flatfiles clearflags (); } elsif ($in_seq == 1) { if ($thisline =~ /^\s+\d+ (.*)$/ || $thisline =~ /^\s+(.*)\s+\d+$/) { # report sequence $curr_seq = $1; $curr_seq =~ s/ //g; $curr_seq = uc $curr_seq; print $FSA_OUT "$curr_seq\n"; } } elsif ($in_feat == 1) { if ($thisline =~ /^ {1,10}(\w+)\s+(.*)$/ || $thisline =~ /^FT (\w+)\s+(.*)$/) { # new feature key and location flushline (); $in_key = 1; $in_qual = 0; $current_key = $1; $current_loc = $2; if ($current_key =~ /source/ || $current_key =~ /Source/) { $is_source = 1; } else { $is_source = 0; } } elsif ($thisline =~ /^\s+\/(\w+)=(.*)$/ || $thisline =~ /^FT\s+\/(\w+)=(.*)$/) { # new qualifier flushline (); $in_key = 0; $in_qual = 1; $current_qual = $1; # remove leading double quote my $val = $2; $val =~ s/\"//g; $current_val = $val; if ($current_qual =~ /translation/) { $is_translation = 1; } else { $is_translation = 0; } if ($current_qual =~ /transl_table/) { $transl_table = $current_val; } } elsif ($thisline =~ /^\s+\/(\w+)$/ || $thisline =~ /^FT\s+\/(\w+)$/) { # new singleton qualifier - e.g., trans-splicing, pseudo flushline (); $in_key = 0; $in_qual = 1; $current_qual = $1; $current_val = ""; $is_translation = 0; } elsif ($thisline =~ /^\s+(.*)$/ || $thisline =~ /^FT\s+(.*)$/) { if ($in_key == 1) { # continuation of feature location $current_loc = $current_loc . $1; } elsif ($in_qual == 1) { # continuation of qualifier # remove trailing double quote my $val = $1; $val =~ s/\"//g; if ($is_translation == 1) { $current_val = $current_val . $val; } else { $current_val = $current_val . " " . $val; } } } } } # close input and output files close ($GBF_IN); close ($FSA_OUT); close ($TBL_OUT); close ($ERR_OUT); # if no bad features or qualifiers, remove error file if (! $has_errors) { unlink ("$base.err"); }