On the user list, Christian Leitinger noted that his team found a potential issue with the thread safety of the encoding detector. I was able to reproduce this on the corpus of HTML files in faghani's encoding detector.
/**
 * Stress test for the thread safety of charset detection.
 *
 * <p>First detects the charset of every corpus file once on the calling thread and
 * records it as the expected value. Then submits {@code numThreads} workers that each
 * re-detect randomly chosen files and throw if a result differs from the recorded one.
 * Any worker failure surfaces here as an {@code ExecutionException} from
 * {@code Future.get()}, which fails the test.
 *
 * @throws Exception if the corpus cannot be read, detection fails, or a worker reports
 *                   a mismatch
 */
@Test
public void testMultiThreadingEncodingDetection() throws Exception {
    Path testDocs = Paths.get("C:/data/encodings/corpus");
    // File.listFiles() returns null (not an empty array) when the path is missing or is
    // not a directory; fail with a readable message rather than a bare NPE.
    File[] encodingDirs = testDocs.toFile().listFiles();
    if (encodingDirs == null) {
        throw new IllegalStateException(
                "corpus directory is missing or unreadable: " + testDocs);
    }

    List<Path> paths = new ArrayList<>();
    Map<Path, String> encodings = new ConcurrentHashMap<>();
    for (File encodingDir : encodingDirs) {
        File[] files = encodingDir.listFiles();
        if (files == null) {
            // Stray regular file at the top level of the corpus: nothing to descend into.
            continue;
        }
        for (File file : files) {
            Path path = file.toPath();
            String encoding = getEncoding(path);
            paths.add(path);
            encodings.put(path, encoding);
        }
    }
    if (paths.isEmpty()) {
        // Workers pick files with Random.nextInt(paths.size()), which rejects a bound of 0.
        throw new IllegalStateException("no test documents found under " + testDocs);
    }

    int numThreads = 1000;
    ExecutorService ex = Executors.newFixedThreadPool(numThreads);
    try {
        CompletionService<String> completionService = new ExecutorCompletionService<>(ex);
        for (int i = 0; i < numThreads; i++) {
            completionService.submit(new EncodingDetectorRunner(paths, encodings), "done");
        }

        int completed = 0;
        while (completed < numThreads) {
            // take() only hands back finished futures, so no isDone() check is needed;
            // get() rethrows a worker's failure wrapped in an ExecutionException.
            Future<String> future = completionService.take();
            if ("done".equals(future.get())) {
                completed++;
            }
        }
    } finally {
        // Always release the pool's threads, including when a worker failure aborts the loop.
        ex.shutdownNow();
    }
    // Reaching this point means no worker reported a mismatch.
    assertTrue("success!", true);
}

/**
 * Worker that repeatedly re-detects the charset of randomly selected files and compares
 * each result with the expected value recorded for that file.
 */
private class EncodingDetectorRunner implements Runnable {
    private final List<Path> paths;
    private final Map<Path, String> encodings;
    private final Random r = new Random();

    /**
     * @param paths     candidate files to sample from; must not be empty
     * @param encodings expected charset name for each path
     */
    private EncodingDetectorRunner(List<Path> paths, Map<Path, String> encodings) {
        this.paths = paths;
        this.encodings = encodings;
    }

    /**
     * Runs 100 detection rounds.
     *
     * @throws RuntimeException if detection itself fails (original exception kept as the
     *                          cause) or if the detected charset differs from the expected one
     */
    @Override
    public void run() {
        for (int i = 0; i < 100; i++) {
            Path path = paths.get(r.nextInt(paths.size()));
            String detectedEncoding;
            try {
                detectedEncoding = getEncoding(path);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            String trueEncoding = encodings.get(path);
            if (!detectedEncoding.equals(trueEncoding)) {
                throw new RuntimeException("detected: " + detectedEncoding
                        + " but should have been: " + trueEncoding + " for " + path);
            }
        }
    }
}

/**
 * Detects the charset of a file by opening it through {@code AutoDetectReader}.
 *
 * @param p file to inspect
 * @return the string form of the detected charset, or {@code "NULL"} if the reader
 *         reports no charset
 * @throws Exception if the file cannot be opened or the reader cannot be constructed
 */
public String getEncoding(Path p) throws Exception {
    // Both resources are closed in reverse order; the reader was previously never closed.
    try (InputStream is = TikaInputStream.get(p);
         AutoDetectReader reader = new AutoDetectReader(is)) {
        // Check for null before calling toString(); checking afterwards could never fire.
        if (reader.getCharset() == null) {
            return "NULL";
        }
        return reader.getCharset().toString();
    }
}
java.util.concurrent.ExecutionException: java.lang.RuntimeException: detected: ISO-8859-1 but should have been: windows-1252 for C:\data\encodings\corpus\Shift_JIS\1 at at java.util.concurrent.FutureTask.get( at org.apache.tika.parser.html.HtmlParserTest.testMultiThreadingEncodingDetection(