-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathschema.xml
101 lines (76 loc) · 4.28 KB
/
schema.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
<?xml version="1.0" ?>
<schema name="documents" version="1.1">
<!--
Fields
-->
<!-- Identity fields: single-valued strings that are both indexed and stored.
     `id` is the field named by this schema's uniqueKey.
     NOTE(review): `parent_id` presumably holds the id of the owning document for
     page-level entries; confirm against the indexing code. -->
<field name="id"
       type="string"
       indexed="true"
       stored="true"
       multiValued="false"/>
<field name="parent_id"
       type="string"
       indexed="true"
       stored="true"
       multiValued="false"/>
<!-- Document level (parent) fields. All are single-valued and stored. -->

<!-- Analyzed with the `text` type, so it is searchable as full text. -->
<field name="title"
       type="text"
       indexed="true"
       stored="true"
       multiValued="false"/>

<!-- Stored only (not indexed): can be returned with results but not searched. -->
<field name="path"
       type="string"
       indexed="false"
       stored="true"
       multiValued="false"/>

<!-- Exact-match string, indexed and stored. -->
<field name="content_type"
       type="string"
       indexed="true"
       stored="true"
       multiValued="false"/>

<!-- The file is kept in two stored-only forms: one using the `binary` type and
     one using the `string` type.
     NOTE(review): both presumably carry the same base64 payload; confirm whether
     both are still needed. -->
<field name="file_binary_base64"
       type="binary"
       indexed="false"
       stored="true"
       multiValued="false"/>
<field name="file_binary_base64_str"
       type="string"
       indexed="false"
       stored="true"
       multiValued="false"/>
<!-- Page level (child) fields. -->

<!-- Page text analyzed with the `text` type. -->
<field name="content"
       type="text"
       indexed="true"
       stored="true"
       multiValued="false"/>

<!-- Page text analyzed with the payload-carrying `ocr` type. Term vectors are kept
     together with positions, offsets and payloads.
     Note that stored="true" here overrides the `ocr` type's stored="false" default. -->
<field name="content_ocr"
       type="ocr"
       indexed="true"
       stored="true"
       multiValued="false"
       termVectors="true"
       termPositions="true"
       termOffsets="true"
       termPayloads="true"/>

<!-- NOTE(review): presumably text pulled from the file's own text layer rather than
     from OCR; confirm against the indexing code. -->
<field name="content_extracted"
       type="text"
       indexed="true"
       stored="true"
       multiValued="false"/>

<!-- Indexed integer, so pages can be filtered or matched by number. -->
<field name="page_number"
       type="int"
       indexed="true"
       stored="true"
       multiValued="false"/>

<!-- The only multi-valued field in this group; stored only, not searchable.
     NOTE(review): presumably holds the page's width and height; confirm value order. -->
<field name="page_dimension"
       type="string"
       indexed="false"
       stored="true"
       multiValued="true"/>
<!--
Copy fields
-->
<!--
Internal schema configuration settings
-->
<!-- _root_ is the field that Parent/Child indexing specifically relies on. -->
<field name="_root_"
       type="string"
       indexed="true"
       stored="false"
       docValues="false"/>

<!-- Left disabled: per SOLR-12519, use either _nest_path_ or a parentFilter, never both.
<fieldType name="_nest_path_" class="solr.NestPathField" />
<field name="_nest_path_" type="_nest_path_"/>
-->

<!-- _version_ is used by Solr for optimistic concurrency. -->
<field name="_version_"
       type="long"
       indexed="true"
       stored="true"/>

<!-- Any field named ignored_* takes the `ignored` type, which is declared in this
     schema as neither stored nor indexed. -->
<dynamicField name="ignored_*"
              type="ignored"
              multiValued="true"/>

<!-- Any field named attr_* is multi-valued, analyzed as `text`, indexed and stored. -->
<dynamicField name="attr_*"
              type="text"
              indexed="true"
              stored="true"
              multiValued="true"/>
<!-- Field used to determine and enforce document uniqueness: the single-valued
     `id` string field declared in this schema. -->
<uniqueKey>id</uniqueKey>
<!--
Field Types
-->
<!-- Sink type: it is declared as neither stored nor indexed, so any data sent to a
     field of this type is dropped outright. -->
<fieldType name="ignored"
           class="solr.StrField"
           indexed="false"
           stored="false"
           multiValued="true"/>
<!-- OCR text whose tokens carry payloads.
     Filter order matters: a payload buffer filter sits on each side of the
     lower-case / word-delimiter / stemming stage. -->
<fieldType name="ocr"
           class="solr.TextField"
           indexed="true"
           stored="false">
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- Payloads are attached here, using the com.o19s Base64Encoder as the encoder. -->
    <filter class="solr.DelimitedPayloadTokenFilterFactory"
            encoder="com.o19s.payloads.Base64Encoder"/>
    <!-- First payload buffer: the word delimiter filter removes payloads from tokens,
         so they are copied here and put back later in the chain. -->
    <filter class="com.o19s.payloads.filter.PayloadBufferFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.WordDelimiterFilterFactory"/>
    <filter class="solr.KStemFilterFactory"/>
    <!-- Second payload buffer: the "replace" half of the workaround described above. -->
    <filter class="com.o19s.payloads.filter.PayloadBufferFilterFactory"/>
  </analyzer>
</fieldType>
<!-- Simple text analysis: standard tokenizer, lower-casing, then KStem stemming.
     Index-time and query-time analysis are the same chain, so a single untyped
     <analyzer> is declared; Solr applies it to both phases. -->
<fieldType name="text"
           class="solr.TextField"
           positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.KStemFilterFactory"/>
  </analyzer>
</fieldType>
<!-- Simple (non-analyzed) types.
     Element names use the `fieldType` spelling, matching the other type
     declarations in this schema.
     NOTE(review): the Trie* classes are deprecated in newer Solr releases; confirm
     the target Solr version before upgrading, since switching to the *PointField
     equivalents requires a reindex. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="int" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0"/>
<!-- NOTE(review): no field visible in this file uses type="date"; confirm whether
     this type is still needed. -->
<fieldType name="date" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/>
<fieldType name="binary" class="solr.BinaryField"/>
</schema>