<?xml version="1.0" encoding="US-ASCII"?>


<!-- ************************************************************* -->

<!--             GAME Genome Annotation Markup Elements            -->

<!--  Document Type Definition DTD - June. 5,1999  - Version 1.0   -->

<!--                        Suzanna E. Lewis                       -->
<!--                           Erwin Frise                         -->

<!--                University of California Berkeley              -->

<!--  $Id: game.dtd.html,v 1.1 2000/03/07 19:54:51 bradmars Exp $    -->

<!-- Annotations are a summarization of all the collected features discerned
and described on related sequences of genomic DNA, transcripts, mRNAs (and
cDNAs which are treated as their logical equivalent), and proteins. Each of
these molecules has regions along their linear length described by
annotators as 'features'. The features themselves are a combined summary of
both computational and genetic analysis of that DNA, RNA, or AA sequence.
Computational analyses are not considered 'features' and are treated as
primary data, as are any experimental analyses carried out at the bench. In
other words, analytical results may be used to identify features, but are
not considered features on their own in this context. Thus, each molecule is
described both in terms of primary analytical results and in terms of expert
defined features that are supported by the preceding results. The
combination of all these associated feature descriptions on the related
molecules (from genomic to protein) constitute a statement that is called an
annotation. -->

<!-- '' == ONE, NO MORE NO LESS -->
<!-- '?' == ZERO OR ONE. -->
<!-- '*' == ZERO OR MORE. -->
<!-- '+' == ONE OR MORE. -->

<!-- Shared building blocks: an entity and simple text-only elements that
     are used in multiple elements. -->

<!-- Attribute type used where an integer is expected. NMTOKEN is only a
     lexical approximation; it does not restrict the value to digits. -->
<!ENTITY % integer "NMTOKEN">

<!ELEMENT value         (#PCDATA)>
<!ELEMENT synonym       (#PCDATA)>
<!ELEMENT program       (#PCDATA)>
<!ELEMENT version       (#PCDATA)>
<!ELEMENT score         (#PCDATA)>

<!-- creation_date: the text is expected to be in ISO date format. -->
<!ELEMENT creation_date (#PCDATA)>

<!-- description: free text, for comments and similar notes. -->
<!ELEMENT description (#PCDATA)>

<!-- Sequence text: DNA, RNA, AA. -->
<!ELEMENT residues    (#PCDATA)>
<!ELEMENT alignment   (#PCDATA)>

<!-- parameter, output and parent share one shape: exactly one type
     followed by exactly one value. -->
<!ELEMENT parameter   (type, value)>
<!ELEMENT output      (type, value)>
<!ELEMENT parent      (type, value)>

<!-- Integer-valued elements. The content is declared as plain text, so
     the integer requirement has to be checked by the application.
     NOTE(review): span also references an "end" element, which is not
     declared alongside these; confirm it is declared elsewhere. -->
<!ELEMENT offset  (#PCDATA)>
<!ELEMENT length  (#PCDATA)>
<!ELEMENT start   (#PCDATA)>

<!-- things to describe where the sequence came from -->
<!ELEMENT species (#PCDATA)>
<!ELEMENT tissue (#PCDATA)>
<!ELEMENT stage (#PCDATA)>
<!ELEMENT project (#PCDATA)>

<!-- site_operator: attribute definition (name plus the enumerated values
     less_than and greater_than) that is specific to the fuzzy site,
     start, and end elements. -->
<!ENTITY % site_operator
  " site_operator (less_than | greater_than)">

<!-- fuzzy_start: contains exactly one span.
     site_operator is optional. It is declared on fuzzy_start, matching
     the usage <fuzzy_start site_operator="less_than">, and is also kept
     on start so that documents which put it there remain valid. -->
<!ELEMENT fuzzy_start (span)>
           <!ATTLIST start
              %site_operator; #IMPLIED
           >
           <!ATTLIST fuzzy_start
              %site_operator; #IMPLIED
           >

<!-- fuzzy_end: contains exactly one span.
     site_operator is optional. It is declared on fuzzy_end because the
     entity is described as specific to the fuzzy start and end elements,
     and is also kept on end so that documents which put it there remain
     valid. -->
<!ELEMENT fuzzy_end (span)>
           <!ATTLIST end
              %site_operator; #IMPLIED
           >
           <!ATTLIST fuzzy_end
              %site_operator; #IMPLIED
           >

<!-- fuzzy_span: exactly one fuzzy_start followed by exactly one
     fuzzy_end. -->
<!ELEMENT fuzzy_span
  (fuzzy_start, fuzzy_end)>

<!-- span: an interval given by one start followed by one end.

     either_dir: for features that do not have an orientation associated
     with them, but apply equally well to either strand.

     between: indicates a position between 2 bases (or more generally
     between 2 sites?). This flag is preferred to a length of zero: with
     a 0 length span it is not possible to tell if one means before or
     after the current base.

     Both attributes are optional and TRUE is their only permitted
     value. -->
<!ELEMENT span (start, end)>
           <!ATTLIST span
              between    (TRUE) #IMPLIED
              either_dir (TRUE) #IMPLIED
           >

<!-- Examples of location encodings.

   I've taken the locations from the descriptions in the GB Feature Table
   Definition.   For descriptions of what the feature spans mean, see
   that document.
   (David Emmert, Harvard)

- Location: 467

- Location: 340..565

- Location: <345..500
           <fuzzy_start site_operator="less_than">

- Location: (102.110)

- Location: (23.45)..600

- Location: (122.133)..(204.221)

- Location: 123^124
        <span between="TRUE">
-->


<!-- Annotation sub-elements. -->
<!-- NAME: the official (by someone's standard) symbol to use -->
<!-- DBXREF: The database cross-reference element refers to a
database where the annotation is generated and maintained. -->
<!-- GENE: Can't seem to avoid the nefarious gene concept. There are
different relationships a gene can have to the annotation element.
One is a positive identification (or assignment) to a gene. The alternative to
this is a list of known genes (from traditional genetic analysis), any of
which are possible candidates for assigning to this annotation. Both of
these assignment elements are naturally supposed to be within the same
species. Relationships to other genes (either within or in other species) are
indicated by enclosing zero or more related gene elements. The specifics of
the type of relationship are held within the sub-element. -->
<!-- DESCRIPTION: is a comment, a free text field for the curators to jot down
any additional information. -->
<!-- FEATURE_SET: to make it possible to set this up in an analogous manner
     to computational_analysis and result_set -->
<!-- SEQ: what sequence this annotation applies to -->
<!-- Annotation attributes. -->
<!-- The id is a unique identifier for other elements to use
     when referencing this annotation. -->

<!-- annotation: optional name, dbxref, description and seq children plus
     any number of gene, aspect and feature_set children, in the order
     listed in the content model.
     id  : required unique identifier for other elements to use when
           referencing this annotation.
     seq : optional IDREF; presumably the id of the seq this annotation
           applies to (confirm). -->
<!ELEMENT annotation (name?, dbxref?, gene*, aspect*, description?, feature_set*, seq?)>
  <!ATTLIST annotation
    id        ID    #REQUIRED
    seq       IDREF #IMPLIED
  >

<!-- aspect: a dbxref followed by exactly one of function, process or
     cellular_component, each of which holds plain text. -->
<!ELEMENT aspect
  (dbxref, (function | process | cellular_component))>
<!ELEMENT function           (#PCDATA)>
<!ELEMENT process            (#PCDATA)>
<!ELEMENT cellular_component (#PCDATA)>

<!-- Obviously there are other sorts of gene to gene relationships and these
still need to be added -->

<!ENTITY % association "association (HOMOLOG|ORTHOLOG|PARALOG|IS|MAY_BE)">

<!-- gene: a dbxref followed by optional name, synonym, species and
     description children, in that order.
     association : required; one of the relationship values enumerated in
                   the association entity.
     annotation  : optional IDREF; presumably the id of a related
                   annotation element (confirm). -->
<!ELEMENT gene (dbxref, name?, synonym?, species?, description?)>
  <!ATTLIST gene
    %association;    #REQUIRED
    annotation IDREF #IMPLIED
  >

<!-- seq sub-elements. -->
<!-- The seq element represents the different DNA, RNA, and AA molecules. -->

<!-- The database cross-reference refers to a sequence database like genbank
or embl (only for genomic and cDNAs of course). -->
<!-- A single origin/source is requested to indicate the derivation of the
primary sequence (this is basically clone information for genomic and cDNA
data). -->
<!-- The residues are always optional for any of these. -->
<!-- seq element IDs are used to support derivation between seq elements -->

<!-- Molecular element attributes. -->
<!-- Each has a unique identifier for other elements to use when referencing
this sequence molecule. It may also act as a label in displays.
Because the DNA, RNA, or AA residue elements are optional a length attribute
is required. The length provides the extent of the number line along which
the features and analysis are positioned -->
<!-- These aspects are associated with an individual sequence
     and not the annotation because a single annotation may
     describe the different gene products that arise from
     the same region of the genome -->

<!-- Enumerated attribute definitions (name plus permitted values) that
     are referenced from the seq attribute list. -->
<!ENTITY % maturity "maturity (primary | processed | pro | pre-pro | pre-pro-pro )">
<!ENTITY % transcript_function "transcript_function (mRNA | rRNA | snoRNA | snRNA | tRNA | trans_spliced_leader)">
<!ENTITY % immigrant "immigrant (transposon | pseudogene | mobile_intron | virus | plasmid)">
<!-- do we really want CDS? it seems redundant.
     right-o its gone, also chucked cDNA -->
<!ENTITY % seq_type "type (AA | RNA | DNA)">

<!-- seq: one sequence molecule. All children are optional; dbxref,
     map_position, clone and parent may repeat.
     id                  : required unique identifier.
     type                : required; AA, RNA or DNA.
     produced_by         : optional IDREF; presumably the id of the
                           feature_set that produces this seq (confirm).
     length              : optional, typed with the integer entity.
                           NOTE(review): the molecular-attribute comment
                           earlier in this file says a length attribute
                           is required; it is left optional here so
                           existing documents stay valid. Confirm intent.
     maturity,
     transcript_function,
     immigrant           : optional enumerations. -->
<!ELEMENT seq (name?, dbxref*, map_position*, source?, project?, clone*, description?, residues?, parent*)>
  <!ATTLIST seq
    id           ID        #REQUIRED
    %seq_type;             #REQUIRED
    produced_by  IDREF     #IMPLIED
    length       %integer; #IMPLIED
    %maturity;             #IMPLIED
    %transcript_function;  #IMPLIED
    %immigrant;            #IMPLIED
  >

 <!ELEMENT seq_relationship ((span | fuzzy_span), alignment?)>
  <!ATTLIST seq_relationship
    type (query | subject | peer | subseq) #IMPLIED

<!-- map_type: the kinds of map a map_position can refer to. -->
<!ENTITY % map_type "type (cytological | linear | ordering)">

<!-- map_position: a map, optionally followed by a span.
     type : required; one of the map_type values. -->
<!ELEMENT map_position (map, span?)>
  <!ATTLIST map_position
    %map_type;             #REQUIRED
  >

<!-- an example of a mapping
<map_position type="cytological">
-->

<!-- feature_set: a set of feature_span elements plus descriptive
     children. The ordering of the 'exons' is implied by the ordering of
     the features in this set. -->
<!-- VERSION: as the annotation progresses versions are maintained -->
<!-- AUTHOR: who/what created this annotation -->
<!-- DATE: date this annotation was first created -->
<!-- Attributes:
     id           : required unique identifier.
     annotation   : optional IDREF; presumably the owning annotation
                    (confirm).
     produces_seq : optional IDREF; presumably the seq produced by this
                    set (confirm). -->
<!ELEMENT feature_set (name?, type?, seq_relationship*, author?, creation_date?, version?, evidence*, parent*, description?, feature_span*, seq?)>
  <!ATTLIST  feature_set
    id           ID         #REQUIRED
    annotation   IDREF      #IMPLIED
    produces_seq IDREF      #IMPLIED
  >

<!-- feature_span: a single feature. A feature is defined by 3 things:
       1. a type,
       2. an interval (start and end) placing it on the molecule in
          question,
       3. the results that support this designation.
     An optional tag_residues child may follow. -->
<!ELEMENT feature_span
  (type?, seq_relationship*, evidence*, tag_residues?)>

<!-- computational_analysis: one run of a program. Only program is
     mandatory; type, database, date and version are optional, and
     parameter and result_set may repeat.
     seq : optional IDREF; presumably the seq that was analysed
           (confirm). -->
<!ELEMENT computational_analysis (type?, database?, program, date?, version?,
                                  parameter*, result_set*)>
  <!ATTLIST computational_analysis
    seq       IDREF   #IMPLIED
  >

<!-- result_set: an optional score, any number of seq_relationship
     children, an optional dbxref, then any number of output,
     result_span and parent children, in that order. -->
<!ELEMENT result_set
  (score?, seq_relationship*, dbxref?, output*, result_span*, parent*)>

<!-- result_span: an optional score and type, at least one
     seq_relationship, then any number of output children.
     id : optional unique identifier, so other elements can reference
          this result. -->
<!ELEMENT result_span (score?, type?, seq_relationship+, output*)>
  <!ATTLIST result_span
    id         ID        #IMPLIED
  >

<!-- tag_residues: wraps exactly one residues element.
     offset : required, typed with the integer entity. -->
<!ELEMENT tag_residues (residues)>
  <!ATTLIST tag_residues
    offset %integer; #REQUIRED
  >

<!-- evidence: an optional dbxref followed by an optional description.
     type   : optional free-text attribute.
     result : optional IDREF, explained in the note that follows the
              attribute list. -->
<!ELEMENT evidence (dbxref?, description?)>
  <!ATTLIST evidence
    type     CDATA #IMPLIED
    result   IDREF #IMPLIED
  >
<!--
     The element that the evidence result attribute refers to may be
     a seq, a computational_analysis or a result_span,
     depending upon whether or not the computed results are actually
     available within the xml document (a result_span). If they are not
     directly provided, this provides a mechanism to indicate how those
     results can be regenerated. A seq element id means that there is an
     alignment.
     A computational_analysis indicates what program to run and how.
     A result_span element means that the program has already been
     run and the results are available within the current document.
-->

<!-- experimental_analysis: one or more experimental_conclusion children,
     exactly one citation, and an optional description.
     seq : required IDREF; presumably the seq that was analysed
           (confirm). -->
<!ELEMENT experimental_analysis (experimental_conclusion+, citation, description?)>
  <!ATTLIST experimental_analysis
    seq       IDREF   #REQUIRED
  >

<!-- experimental_conclusion: plain text.
     id : optional unique identifier, so other elements can reference
          this conclusion. -->
<!ELEMENT experimental_conclusion (#PCDATA)>
  <!ATTLIST experimental_conclusion
    id        ID    #IMPLIED
  >

<!-- database: a name, optionally followed by a date and a version. -->
<!ELEMENT database   (name, date?, version?)>

<!-- dbxref: a database cross-reference, made of an xref_db optionally
     followed by an xref_db_id; both hold plain text. -->
<!ELEMENT dbxref     (xref_db, xref_db_id?)>
<!ELEMENT xref_db    (#PCDATA)>
<!ELEMENT xref_db_id (#PCDATA)>

<!-- Everything below this point is very sketchy, so don't
     jump to any conclusions from what follows -->

<!-- Species is mandatory, it is the origin of the sequence.
     NOTE(review): the content model below declares species as optional;
     confirm which is intended. -->
<!-- project is who generated this sequence (but not necessarily the
     associated features and analyses). -->

<!-- source: optional species, tissue and stage, in that order. -->
<!ELEMENT source
  (species?, tissue?, stage?)>

<!-- clone: one or more dbxref children (database cross references for
     the clone itself), then an optional span and an optional
     description. Text content for, say, ordering information? -->
<!ELEMENT clone
  (dbxref+, span?, description?)>

<!-- use the dublin core here?? -->
<!-- pub_type: the kinds of publication a citation can describe. -->
<!ENTITY % pub_type "type (Journal | Personal_communication | Proceedings | Book)">

<!-- citation: title and date are mandatory; dbxref, journal, volume and
     pages are optional, and author may repeat.
     type : required; one of the pub_type values. -->
<!ELEMENT citation (dbxref?, title, journal?, date, author*, volume?, pages?)>
  <!ATTLIST citation
    %pub_type;          #REQUIRED
  >

<!-- Plain-text children of citation. -->
<!ELEMENT title   (#PCDATA)>
<!ELEMENT author  (#PCDATA)>
<!ELEMENT volume  (#PCDATA)>
<!ELEMENT pages   (#PCDATA)>
<!ELEMENT journal (#PCDATA)>