Skip to content

Commit

Permalink
Make tika extension work as per the vanilla Camel component
Browse files Browse the repository at this point in the history
Fixes #5234
Fixes #5393
  • Loading branch information
jamesnetherton committed Jan 31, 2025
1 parent 7742843 commit 16abe1b
Show file tree
Hide file tree
Showing 21 changed files with 50 additions and 363 deletions.
34 changes: 0 additions & 34 deletions docs/modules/ROOT/pages/reference/extensions/tika.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -44,37 +44,3 @@ Or add the coordinates to your existing project:
ifeval::[{doc-show-user-guide-link} == true]
Check the xref:user-guide/index.adoc[User guide] for more information about writing Camel Quarkus applications.
endif::[]

[id="extensions-tika-camel-quarkus-limitations"]
== Camel Quarkus limitations

The `tikaConfig` and `tikaConfigUri` parameters are not available in the Camel Quarkus Tika extension. Configuration
can only be changed via `application.properties`.

While you can use any of the available https://tika.apache.org/1.24.1/formats.html[Tika parsers] in JVM mode,
only some of those are supported in native mode - see the https://quarkiverse.github.io/quarkiverse-docs/quarkus-tika/dev/index.html[Quarkus Tika guide].

PDF and ODF parsers cannot be used in either JVM mode or native mode. The PDF extension is recommended for consuming PDF documents, to avoid a version conflict between Camel and the Quarkus Tika extension involving the PDFBox dependency.

Using the Tika parser without any configuration initializes all available parsers. Unfortunately, because some of them
do not work in native mode, the whole execution will fail.

To make the Tika parser work in native mode, explicitly select the parsers to initialize using the following properties.

* `quarkus.tika.parsers` Comma-separated list of parser abbreviations. There are two predefined abbreviations:
`pdf` and `odf`.
* `quarkus.tika.parser.*` Adds a new parser abbreviation that can be used with the previous property. The value is the
fully qualified class name of the parser.

Example of `application.properties`:
[source,properties]
----
quarkus.tika.parsers = pdf,odf,office
quarkus.tika.parser.office = org.apache.tika.parser.microsoft.OfficeParser
----

For more information about selecting parsers see the https://quarkiverse.github.io/quarkiverse-docs/quarkus-tika/dev/index.html[Quarkus Tika guide].

You may need to add the `quarkus-awt` extension to build the native image. For more information, see the https://quarkiverse.github.io/quarkiverse-docs/quarkus-tika/dev/index.html[Quarkus Tika guide].


12 changes: 8 additions & 4 deletions extensions/tika/deployment/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@
<name>Camel Quarkus :: Tika :: Deployment</name>

<dependencies>
<dependency>
<groupId>io.quarkus</groupId>
<artifactId>quarkus-netty-deployment</artifactId>
</dependency>
<dependency>
<groupId>io.quarkus</groupId>
<artifactId>quarkus-awt-deployment</artifactId>
</dependency>
<dependency>
<groupId>org.apache.camel.quarkus</groupId>
<artifactId>camel-quarkus-core-deployment</artifactId>
Expand All @@ -38,10 +46,6 @@
<groupId>org.apache.camel.quarkus</groupId>
<artifactId>camel-quarkus-tika</artifactId>
</dependency>
<dependency>
<groupId>io.quarkiverse.tika</groupId>
<artifactId>quarkus-tika-deployment</artifactId>
</dependency>
<dependency>
<groupId>org.apache.camel.quarkus</groupId>
<artifactId>camel-quarkus-support-xalan-deployment</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,49 +16,40 @@
*/
package org.apache.camel.quarkus.component.tika.deployment;

import io.quarkus.arc.deployment.BeanContainerBuildItem;
import java.util.Set;

import io.quarkus.deployment.annotations.BuildProducer;
import io.quarkus.deployment.annotations.BuildStep;
import io.quarkus.deployment.annotations.ExecutionTime;
import io.quarkus.deployment.annotations.Record;
import io.quarkus.deployment.builditem.FeatureBuildItem;
import io.quarkus.deployment.builditem.nativeimage.RuntimeInitializedClassBuildItem;
import org.apache.camel.component.tika.TikaComponent;
import org.apache.camel.quarkus.component.tika.TikaRecorder;
import org.apache.camel.quarkus.core.deployment.spi.CamelRuntimeBeanBuildItem;
import org.apache.camel.quarkus.core.deployment.spi.CamelServiceFilter;
import org.apache.camel.quarkus.core.deployment.spi.CamelServiceFilterBuildItem;
import org.jboss.logging.Logger;
import io.quarkus.deployment.builditem.nativeimage.NativeImageResourceBuildItem;
import io.quarkus.deployment.builditem.nativeimage.ServiceProviderBuildItem;
import io.quarkus.deployment.util.ServiceUtil;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.parser.Parser;

class TikaProcessor {

private static final Logger LOG = Logger.getLogger(TikaProcessor.class);
private static final String FEATURE = "camel-tika";

/**
 * Build step that produces the {@link FeatureBuildItem} for this extension,
 * named by the {@code FEATURE} constant ({@code "camel-tika"}).
 *
 * @return a feature build item carrying the extension's feature name
 */
@BuildStep
FeatureBuildItem feature() {
return new FeatureBuildItem(FEATURE);
}

/*
* The tika component is programmatically configured by the extension thus
* we can safely prevent camel to instantiate a default instance.
*/
@BuildStep
CamelServiceFilterBuildItem serviceFilter() {
return new CamelServiceFilterBuildItem(CamelServiceFilter.forComponent("tika"));
/**
 * Produces one {@link NativeImageResourceBuildItem} for each of two Tika XML resources:
 * {@code tika-mimetypes.xml} and {@code tika-external-parsers.xml}.
 * NOTE(review): presumably Tika loads these from the classpath at runtime, so they
 * have to be embedded in the native image - confirm.
 *
 * @param resource producer that receives the resource build items
 */
void registerTikaCoreResources(BuildProducer<NativeImageResourceBuildItem> resource) {
resource.produce(new NativeImageResourceBuildItem("org/apache/tika/mime/tika-mimetypes.xml"));
resource.produce(new NativeImageResourceBuildItem("org/apache/tika/parser/external/tika-external-parsers.xml"));
}

@Record(ExecutionTime.STATIC_INIT)
@BuildStep
CamelRuntimeBeanBuildItem tikaComponent(BeanContainerBuildItem beanContainer, TikaRecorder recorder) {
return new CamelRuntimeBeanBuildItem(
"tika",
TikaComponent.class.getName(),
recorder.createTikaComponent(beanContainer.getValue()));
/**
 * Produces a {@link ServiceProviderBuildItem} for each of the Tika
 * {@link EncodingDetector} and {@link Parser} service types. The provider class
 * names for each type are obtained from {@code getProviderNames}, which reads the
 * matching {@code META-INF/services/} entry.
 *
 * @param serviceProvider producer that receives the service provider build items
 * @throws Exception if looking up the provider names fails
 */
void registerTikaServices(BuildProducer<ServiceProviderBuildItem> serviceProvider) throws Exception {
serviceProvider.produce(new ServiceProviderBuildItem(EncodingDetector.class.getName(),
getProviderNames(EncodingDetector.class.getName())));
serviceProvider.produce(new ServiceProviderBuildItem(Parser.class.getName(), getProviderNames(Parser.class.getName())));
}

@BuildStep
RuntimeInitializedClassBuildItem runtimeInitializedClasses() {
return new RuntimeInitializedClassBuildItem("org.apache.pdfbox.text.LegacyPDFStreamEngine");
/**
 * Looks up the class names named in the {@code META-INF/services/} resource for the
 * given service type, delegating to {@code ServiceUtil.classNamesNamedIn} with the
 * current thread's context class loader.
 *
 * @param serviceProviderName name of the service type; appended to {@code META-INF/services/}
 *                            to form the resource path
 * @return the class names returned by {@code ServiceUtil.classNamesNamedIn}
 * @throws Exception if the lookup fails
 */
private Set<String> getProviderNames(String serviceProviderName) throws Exception {
return ServiceUtil.classNamesNamedIn(Thread.currentThread().getContextClassLoader(),
"META-INF/services/" + serviceProviderName);
}
}
12 changes: 8 additions & 4 deletions extensions/tika/runtime/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@
</properties>

<dependencies>
<dependency>
<groupId>io.quarkus</groupId>
<artifactId>quarkus-netty</artifactId>
</dependency>
<dependency>
<groupId>io.quarkus</groupId>
<artifactId>quarkus-awt</artifactId>
</dependency>
<dependency>
<groupId>org.apache.camel.quarkus</groupId>
<artifactId>camel-quarkus-core</artifactId>
Expand All @@ -48,10 +56,6 @@
<groupId>org.apache.camel.quarkus</groupId>
<artifactId>camel-quarkus-support-xalan</artifactId>
</dependency>
<dependency>
<groupId>io.quarkiverse.tika</groupId>
<artifactId>quarkus-tika</artifactId>
</dependency>
</dependencies>

<build>
Expand Down
29 changes: 0 additions & 29 deletions extensions/tika/runtime/src/main/doc/limitations.adoc

This file was deleted.

This file was deleted.

6 changes: 3 additions & 3 deletions integration-tests/tika/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.camel.quarkus</groupId>
Expand Down Expand Up @@ -53,7 +54,7 @@
<dependency>
<groupId>io.quarkus</groupId>
<artifactId>quarkus-junit5</artifactId>
<scope>test</scope>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>org.jsoup</groupId>
Expand All @@ -68,7 +69,6 @@
</dependency>
</dependencies>


<profiles>
<profile>
<id>native</id>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,12 @@
import jakarta.ws.rs.Produces;
import jakarta.ws.rs.core.MediaType;
import jakarta.ws.rs.core.Response;
import org.apache.camel.Exchange;
import org.apache.camel.ProducerTemplate;
import org.jboss.logging.Logger;

@Path("/tika")
@ApplicationScoped
public class TikaResource {

private static final Logger LOG = Logger.getLogger(TikaResource.class);

@Inject
ProducerTemplate producerTemplate;

Expand All @@ -43,23 +40,11 @@ public class TikaResource {
@Consumes(MediaType.APPLICATION_OCTET_STREAM)
@Produces(MediaType.TEXT_PLAIN)
public Response parse(byte[] message) throws Exception {
final String response = producerTemplate.requestBody("tika:parse", message, String.class);
return Response
.created(new URI("https://camel.apache.org/"))
.entity(response)
.build();
}

@Path("/parseAsText")
@POST
@Consumes(MediaType.APPLICATION_OCTET_STREAM)
@Produces(MediaType.TEXT_PLAIN)
public Response parseAsTxt(byte[] message) throws Exception {
final String response = producerTemplate.requestBody("tika:parse?tikaParseOutputFormat=text", message,
String.class);
final Exchange response = producerTemplate.request("tika:parse", exchange -> exchange.getMessage().setBody(message));
return Response
.created(new URI("https://camel.apache.org/"))
.entity(response)
.header("Parsed-Content-Type", response.getMessage().getHeader(Exchange.CONTENT_TYPE))
.entity(response.getMessage().getBody(String.class))
.build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,4 @@
## limitations under the License.
## ---------------------------------------------------------------------------

#quarkus.tika.parsers= pdf,odf,office,xml,image //Requires a new release of quarkiverse-tika that adopts Tika with PDFBox 3.x
quarkus.tika.parsers= odf,office,xml,image
quarkus.tika.parser.office = org.apache.tika.parser.microsoft.OfficeParser
quarkus.tika.parser.image = org.apache.tika.parser.image.ImageParser
quarkus.tika.parser.xml = org.apache.tika.parser.xml.DcXMLParser
quarkus.native.resources.includes=assets/*
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,8 @@
package org.apache.camel.quarkus.component.tika.it;

import io.quarkus.test.junit.QuarkusIntegrationTest;
import org.junit.jupiter.api.condition.DisabledOnOs;
import org.junit.jupiter.api.condition.OS;

@QuarkusIntegrationTest
//https://github.com/apache/camel-quarkus/issues/3417
@DisabledOnOs(OS.MAC)
class TikaIT extends TikaTest {

}
Loading

0 comments on commit 16abe1b

Please sign in to comment.