forked from ga4gh/ga4gh-schemas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsequenceAnnotations.avdl
167 lines (141 loc) · 4.29 KB
/
sequenceAnnotations.avdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
@namespace("org.ga4gh.models")
/**
This protocol defines annotations on GA4GH genomic sequences. It includes two
types of annotations: continuous and discrete hierarchical.
The discrete hierarchical annotations are derived from the Sequence Ontology
(SO) and GFF3 work:
http://www.sequenceontology.org/gff3.shtml
The goal is to be able to store annotations using the GFF3 and SO conceptual
model, although there is not necessarily a one-to-one mapping from Avro
records to GFF3 records.
The minimum requirement is to be able to accurately represent the current
state-of-the-art annotation data and the full SO model. Feature is the
core generic record, which corresponds to a GFF3 record.
*/
protocol SequenceAnnotations {
import idl "common.avdl";
import idl "ontologies.avdl";
/**
Identifier of an object in a public (external) database. It names the
database, the ID inside that database, and a version. All three fields are
required strings: none of them declares a default value.
*/
record ExternalIdentifier {
/**
The source of the identifier, i.e. the name of the external database.
(e.g. `Ensembl`)
*/
string database;
/**
The ID defined by the external database.
(e.g. `ENST00000000000`)
*/
string identifier;
/**
The version of the object or of the database, carried as a string.
(e.g. `78`)
*/
string version;
}
/**
Type defining a collection of attributes associated with various protocol
records. Each attribute is a name that maps to an array of one or more
values. Values can be strings, external identifiers, or ontology terms.
Values should be split into separate array elements instead of using a
separator syntax that needs to be parsed.
*/
record Attributes {
/**
Map from attribute name to the array of values of that attribute. Each value
is a string, an ExternalIdentifier or an OntologyTerm. Defaults to an empty
map.
*/
map<array<union {string, ExternalIdentifier, OntologyTerm}>> vals = {};
}
/**
Continuous numerical annotation along a path: one region (`path`) plus the
list of values (`values`) that annotate it.
*/
record Wiggle {
/**
The path for the region being annotated.
Note that we are fusing parts of UCSC BedGraph and Wiggle syntax.
The region is being fully annotated, but can be divided into bins.
If you have gaps, you need to define a sequence of such Wiggles.
NOTE(review): `Path` is not declared in this file; it presumably comes from
one of the imported IDL files - confirm.
*/
Path path;
/**
The values associated with this region. Defaults to an empty list.
If this list contains _count_ elements, then the region is divided
as cleanly as possible into _count_ bins of equal width.
We thus define a numerical function, where `array` is this list of values
and `start` / `length` presumably describe the region given by `path`
(TODO confirm against the definition of Path):

    value(position):
      if position < start or position >= start + length:
        return None
      else:
        return array[floor((position - start)*count/length)]
*/
array<float> values = [];
}
/**
A set of wiggle annotations, identified by `id` and described by free-form
attributes. Both fields are required: neither declares a default value.
*/
record WiggleSet {
/** Id of this wiggle set. */
string id;
/** Set of additional attributes. */
Attributes attributes;
}
/**
Node in the annotation graph that annotates a contiguous region of a
sequence. Every field is required: none of them declares a default value.
*/
record Feature {
/**
Id of this annotation node.
*/
string id;
/**
Ids of the parents of this annotation node (presumably the `id` values of
other Feature records - confirm).
*/
array<string> parentIds;
/**
Identifier for the containing feature set (presumably the `id` of a
FeatureSet record - confirm).
*/
string featureSetId;
/**
Genomic location.
NOTE(review): `Path` is not declared in this file; it presumably comes from
one of the imported IDL files - confirm.
*/
Path path;
/**
Feature that is annotated by this region. Normally, this will be a term in
the Sequence Ontology.
*/
OntologyTerm featureType;
/**
Name/value attributes of the annotation. Attribute names follow the GFF3
naming convention: reserved names start with an upper-case character, and
user-defined names start with a lower-case character. Most GFF3 pre-defined
attributes apply; the exceptions are ID and Parent, which are defined as
fields. Additionally, the following attributes are added:
* Score - the GFF3 score column
* Phase - the GFF3 phase column for CDS features.
*/
Attributes attributes;
}
/**
A set of region annotations. Only `id` and `attributes` are required; the
nullable string fields all default to null.
*/
record FeatureSet {
/** The ID of this annotation set. */
string id;
/** The ID of the dataset this annotation set belongs to. May be null. */
union { null, string } datasetId = null;
/**
The ID of the reference set which defines the coordinate-space for this
set of annotations. May be null; it defaults to null like the other nullable
fields of this record.
*/
union { null, string } referenceSetId = null;
/** The display name for this annotation set. May be null. */
union { null, string } name = null;
/**
The source URI describing the file from which this annotation set was
generated, if any.
*/
union { null, string } sourceURI = null;
/** Set of additional attributes. */
Attributes attributes;
}
}