From 317755ba612b203e1739cb5c0190fc753d613df5 Mon Sep 17 00:00:00 2001 From: Chris Fraire Date: Wed, 15 Apr 2020 22:21:01 -0500 Subject: [PATCH 01/15] Fix to cut extra elements from Java 11 serialization --- .../opengrok/indexer/configuration/ConfigurationHelp.java | 5 +++++ .../indexer/configuration/ConfigurationHelpTest.java | 8 ++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/ConfigurationHelp.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/ConfigurationHelp.java index 81ea875c018..8f6be420edf 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/ConfigurationHelp.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/ConfigurationHelp.java @@ -92,10 +92,15 @@ public static String getSamples() throws RuntimeException { mthd); } + String propertyName = mthd.getName().replaceFirst("^set", ""); + sample = conf.getXMLRepresentationAsString(); sample = sample.replaceFirst( "(?sx)^<\\?xml.*Configuration\\d*\">\\n", ""); sample = sample.replaceFirst("\\n", ""); + // With Java 11 the following excision is necessary. + sample = sample.replaceFirst("(?isx)^.*\\n(?=\\s*. + * Copyright (c) 2018, 2020, Chris Fraire . 
*/ package org.opengrok.indexer.configuration; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import org.junit.Test; @@ -33,9 +34,12 @@ public class ConfigurationHelpTest { @Test public void shouldCreateReadableUsage() { String samples = ConfigurationHelp.getSamples(); - assertTrue("samples are not empty", !samples.isEmpty()); + assertFalse("samples are not empty", samples.isEmpty()); assertTrue("samples contains \"\n" + + " \n")); } } From 4766031c9cb73c47940ee26cafb49edd13fc45c0 Mon Sep 17 00:00:00 2001 From: Chris Fraire Date: Thu, 16 Apr 2020 19:48:41 -0500 Subject: [PATCH 02/15] Relocate as static final --- .../java/org/opengrok/indexer/analysis/FileAnalyzer.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/FileAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/FileAnalyzer.java index 7b558656a7a..ccead09f4b6 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/FileAnalyzer.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/FileAnalyzer.java @@ -59,6 +59,7 @@ public class FileAnalyzer extends AbstractAnalyzer { private static final Logger LOGGER = LoggerFactory.getLogger(FileAnalyzer.class); + private static final String ANALYZER_LC = "analyzer"; /** * @return {@code null} as there is no aligned language @@ -134,10 +135,9 @@ protected FileAnalyzer(AnalyzerFactory factory, @Override public String getFileTypeName() { String name = this.getClass().getSimpleName().toLowerCase(Locale.ROOT); - String suffix = "analyzer"; - if (name.endsWith(suffix)) { - return name.substring(0, name.length() - suffix.length()); + if (name.endsWith(ANALYZER_LC)) { + return name.substring(0, name.length() - ANALYZER_LC.length()); } return name; From 5cdfd57fde48ab6f6cb93835d89aca9bd82578df Mon Sep 17 00:00:00 2001 From: Chris Fraire Date: Thu, 16 Apr 2020 19:46:44 -0500 Subject: [PATCH 
03/15] Store QueryBuilder.T for every Genre --- .../main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java index 00725f4685a..881aa2c6520 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java @@ -626,7 +626,7 @@ public void populateDocument(Document doc, File file, String path, if (fa != null) { AbstractAnalyzer.Genre g = fa.getGenre(); - if (g == AbstractAnalyzer.Genre.PLAIN || g == AbstractAnalyzer.Genre.XREFABLE || g == AbstractAnalyzer.Genre.HTML) { + if (g != null) { doc.add(new Field(QueryBuilder.T, g.typeName(), string_ft_stored_nanalyzed_norms)); } fa.analyze(doc, StreamSource.fromFile(file), xrefOut); From b5b252e0cc5c1360c4bb677fd6886bde2c5d5bb1 Mon Sep 17 00:00:00 2001 From: Chris Fraire Date: Thu, 16 Apr 2020 19:57:50 -0500 Subject: [PATCH 04/15] AnalyzerGuru is actually (unfortunately) a singleton at the moment --- .../org/opengrok/indexer/analysis/AnalyzerGuru.java | 11 +++++++---- .../org/opengrok/indexer/index/IndexDatabase.java | 6 ++---- .../indexer/analysis/LuceneCompatibilityTest.java | 4 +--- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java index 881aa2c6520..868fdee17a8 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java @@ -389,7 +389,7 @@ public static Map getfileTypeDescriptions() { return Collections.unmodifiableMap(fileTypeDescriptions); } - public List getAnalyzerFactories() { + public static List 
getAnalyzerFactories() { return Collections.unmodifiableList(factories); } @@ -575,9 +575,8 @@ public static void returnAnalyzers() { * @throws IOException If an exception occurs while collecting the data * @throws InterruptedException if a timeout occurs */ - public void populateDocument(Document doc, File file, String path, - AbstractAnalyzer fa, Writer xrefOut) throws IOException, - InterruptedException { + public static void populateDocument(Document doc, File file, String path, + AbstractAnalyzer fa, Writer xrefOut) throws IOException, InterruptedException { String date = DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND); @@ -1190,4 +1189,8 @@ private static boolean factoriesDifferent(AnalyzerFactory a, } return a_name == null || !a_name.equals(b_name); } + + /* private to enforce static */ + private AnalyzerGuru() { + } } diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java index 948f527fa2d..872955ee976 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java @@ -129,7 +129,7 @@ public class IndexDatabase { private final Map indexedSymlinks = new TreeMap<>( Comparator.comparingInt(String::length).thenComparing(o -> o)); - private Project project; + private final Project project; private FSDirectory indexDirectory; private IndexReader reader; private IndexWriter writer; @@ -138,7 +138,6 @@ public class IndexDatabase { private TermsEnum uidIter; private PostingsEnum postsIter; private PathAccepter pathAccepter; - private AnalyzerGuru analyzerGuru; private File xrefDir; private boolean interrupted; private CopyOnWriteArrayList listeners; @@ -307,7 +306,6 @@ private void initialize() throws IOException { lockfact = pickLockFactory(env); indexDirectory = FSDirectory.open(indexDir.toPath(), lockfact); 
pathAccepter = env.getPathAccepter(); - analyzerGuru = new AnalyzerGuru(); xrefDir = new File(env.getDataRootFile(), XREF_DIR); listeners = new CopyOnWriteArrayList<>(); dirtyFile = new File(indexDir, "dirty"); @@ -726,7 +724,7 @@ private void addFile(File file, String path, Ctags ctags) Document doc = new Document(); try (Writer xrefOut = newXrefWriter(fa, path)) { - analyzerGuru.populateDocument(doc, file, path, fa, xrefOut); + AnalyzerGuru.populateDocument(doc, file, path, fa, xrefOut); } catch (InterruptedException e) { LOGGER.log(Level.WARNING, "File ''{0}'' interrupted--{1}", new Object[]{path, e.getMessage()}); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/LuceneCompatibilityTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/LuceneCompatibilityTest.java index 074fbfaf74c..aab2ae95ba4 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/LuceneCompatibilityTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/LuceneCompatibilityTest.java @@ -72,7 +72,6 @@ public static Test suite() { } } Analyzer testA; - AnalyzerGuru guru; Method testM; Object testC = null; @@ -81,7 +80,6 @@ public static Test suite() { */ @Override protected void setUp() throws Exception { - guru = new AnalyzerGuru(); Class c = Class.forName(LUCENE_TEST_CLASS); //testC = c.newInstance(); //this is static call Class[] argTypes = {TokenStream.class, String[].class, int[].class, int[].class, String[].class, int[].class, int[].class, Integer.class, boolean.class}; @@ -89,7 +87,7 @@ protected void setUp() throws Exception { } public void testCompatibility() throws Exception { - for (AnalyzerFactory fa : guru.getAnalyzerFactories()) { + for (AnalyzerFactory fa : AnalyzerGuru.getAnalyzerFactories()) { String input = "Hello world"; String[] output = new String[]{"Hello", "world"}; testA = fa.getAnalyzer(); From 1b26df7e1c299bf940b4cf40cc121a211ea81f9a Mon Sep 17 00:00:00 2001 From: Chris Fraire Date: Fri, 
17 Apr 2020 19:37:27 -0500 Subject: [PATCH 05/15] Delete unused fileUpdate() --- .../indexer/index/DefaultIndexChangedListener.java | 6 +----- .../opengrok/indexer/index/IndexChangedListener.java | 6 +----- .../java/org/opengrok/indexer/index/IndexerTest.java | 10 +--------- 3 files changed, 3 insertions(+), 19 deletions(-) diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/DefaultIndexChangedListener.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/DefaultIndexChangedListener.java index d7cd2fcfbf6..b4df2ec0772 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/DefaultIndexChangedListener.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/DefaultIndexChangedListener.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2008, 2018, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2018, Chris Fraire . + * Portions Copyright (c) 2018, 2020, Chris Fraire . */ package org.opengrok.indexer.index; @@ -50,10 +50,6 @@ public void fileAdd(String path, String analyzer) { public void fileRemove(String path) { LOGGER.log(Level.FINE, "Remove file:{0}", path); } - @Override - public void fileUpdate(String path) { - LOGGER.log(Level.FINE, "Update: {0}", path); - } @Override public void fileAdded(String path, String analyzer) { diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexChangedListener.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexChangedListener.java index d7fa0921053..c0700775104 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexChangedListener.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexChangedListener.java @@ -19,6 +19,7 @@ /* * Copyright (c) 2008, 2018, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright (c) 2020, Chris Fraire . 
*/ package org.opengrok.indexer.index; @@ -50,9 +51,4 @@ public interface IndexChangedListener { * @param path The path to the file (absolute from source root) */ void fileRemoved(String path); - /** - * A file is to be updated in the index database. - * @param path The path to the file (absolute from source root) - */ - void fileUpdate(String path); } diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexerTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexerTest.java index f01bc63a9d4..4f212504bbe 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexerTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexerTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2008, 2018, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . * Portions Copyright (c) 2020, Ric Harris . */ package org.opengrok.indexer.index; @@ -202,10 +202,6 @@ public void fileAdded(String path, String analyzer) { public void fileRemove(String path) { } - @Override - public void fileUpdate(String path) { - } - @Override public void fileRemoved(String path) { removedFiles.add(path); @@ -281,10 +277,6 @@ public void fileAdded(String path, String analyzer) { public void fileRemove(String path) { } - @Override - public void fileUpdate(String path) { - } - @Override public void fileRemoved(String path) { // The test for the file existence needs to be performed here From 752a7ce28c2dcb327248218b259a0069cf785c93 Mon Sep 17 00:00:00 2001 From: Chris Fraire Date: Wed, 15 Apr 2020 21:41:50 -0500 Subject: [PATCH 06/15] Fix #534 Fix #1646 Fix #3097 : constrain huge text files --- .../indexer/analysis/AnalyzerGuru.java | 21 +- .../analysis/data/HugeTextAnalyzer.java | 92 +++++++++ .../data/HugeTextAnalyzerFactory.java | 54 +++++ .../indexer/configuration/Configuration.java | 38 ++++ .../configuration/RuntimeEnvironment.java 
| 32 +++ .../opengrok/indexer/index/IndexDatabase.java | 37 +++- .../org/opengrok/indexer/index/Indexer.java | 38 +++- .../opengrok/indexer/util/LimitedReader.java | 86 ++++++++ .../opengrok/indexer/web/SearchHelper.java | 44 ++++- .../opengrok/indexer/web/SingleResult.java | 47 +++++ .../opengrok/indexer/index/HugeTextTest.java | 185 ++++++++++++++++++ .../indexer/util/LimitedReaderTest.java | 72 +++++++ .../java/org/opengrok/web/PageConfig.java | 24 +++ opengrok-web/src/main/webapp/history.jsp | 7 +- opengrok-web/src/main/webapp/list.jsp | 46 +++-- opengrok-web/src/main/webapp/more.jsp | 7 +- opengrok-web/src/main/webapp/search.jsp | 9 +- 17 files changed, 791 insertions(+), 48 deletions(-) create mode 100644 opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzer.java create mode 100644 opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzerFactory.java create mode 100644 opengrok-indexer/src/main/java/org/opengrok/indexer/util/LimitedReader.java create mode 100644 opengrok-indexer/src/main/java/org/opengrok/indexer/web/SingleResult.java create mode 100644 opengrok-indexer/src/test/java/org/opengrok/indexer/index/HugeTextTest.java create mode 100644 opengrok-indexer/src/test/java/org/opengrok/indexer/util/LimitedReaderTest.java diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java index 868fdee17a8..bfdadc77b34 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java @@ -67,6 +67,7 @@ import org.opengrok.indexer.analysis.c.CxxAnalyzerFactory; import org.opengrok.indexer.analysis.clojure.ClojureAnalyzerFactory; import org.opengrok.indexer.analysis.csharp.CSharpAnalyzerFactory; +import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory; import 
org.opengrok.indexer.analysis.data.IgnorantAnalyzerFactory; import org.opengrok.indexer.analysis.data.ImageAnalyzerFactory; import org.opengrok.indexer.analysis.document.MandocAnalyzerFactory; @@ -244,6 +245,8 @@ public class AnalyzerGuru { private static final LangTreeMap langMap = new LangTreeMap(); private static final LangTreeMap defaultLangMap = new LangTreeMap(); + private static String hugeTextFileTypeName; + /* * If you write your own analyzer please register it here. The order is * important for any factory that uses a FileAnalyzerFactory.Matcher @@ -303,7 +306,8 @@ public class AnalyzerGuru { new AsmAnalyzerFactory(), new HCLAnalyzerFactory(), new TerraformAnalyzerFactory(), - new RAnalyzerFactory() + new RAnalyzerFactory(), + HugeTextAnalyzerFactory.DEFAULT_INSTANCE }; for (AnalyzerFactory analyzer : analyzers) { @@ -393,6 +397,21 @@ public static List getAnalyzerFactories() { return Collections.unmodifiableList(factories); } + /** + * Gets the normalized name of the + * {@link org.opengrok.indexer.analysis.data.HugeTextAnalyzer} class. + * @return a defined instance + */ + public static String getHugeTextFileTypeName() { + if (hugeTextFileTypeName == null) { + String newValue = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer(). + getFileTypeName(); + hugeTextFileTypeName = newValue; + return newValue; + } + return hugeTextFileTypeName; + } + /** * Register a {@code FileAnalyzerFactory} instance. */ diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzer.java new file mode 100644 index 00000000000..118cd9a533f --- /dev/null +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzer.java @@ -0,0 +1,92 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, Chris Fraire . + */ + +package org.opengrok.indexer.analysis.data; + +import org.apache.lucene.document.Document; +import org.opengrok.indexer.analysis.AnalyzerFactory; +import org.opengrok.indexer.analysis.FileAnalyzer; +import org.opengrok.indexer.analysis.OGKTextField; +import org.opengrok.indexer.analysis.StreamSource; +import org.opengrok.indexer.configuration.RuntimeEnvironment; +import org.opengrok.indexer.search.QueryBuilder; +import org.opengrok.indexer.util.LimitedReader; +import org.opengrok.indexer.util.IOUtils; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.Writer; +import java.nio.charset.StandardCharsets; + +/** + * Represents an analyzer for huge text data files that are not eligible for + * xref. + */ +public class HugeTextAnalyzer extends FileAnalyzer { + + /** + * Creates a new instance. + * @param factory defined instance for the analyzer + */ + protected HugeTextAnalyzer(AnalyzerFactory factory) { + super(factory); + } + + /** + * @return {@code null} as there is no aligned language + */ + @Override + public String getCtagsLang() { + return null; + } + + /** + * Gets a version number to be used to tag processed documents so that + * re-analysis can be re-done later if a stored version number is different + * from the current implementation. 
+ * @return 20200415_00 + */ + @Override + protected int getSpecializedVersionNo() { + return 20200415_00; // Edit comment above too! + } + + @Override + public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException { + /* + * Though we don't intend to xref, Lucene demands consistency or else it + * would throw IllegalArgumentException: cannot change field "full" from + * index options=DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS to + * inconsistent index options=DOCS_AND_FREQS_AND_POSITIONS + */ + doc.add(new OGKTextField(QueryBuilder.FULL, getReader(src.getStream()))); + } + + protected Reader getReader(InputStream stream) throws IOException { + // sourceRoot is read with UTF-8 as a default. + return new LimitedReader(IOUtils.createBOMStrippedReader(stream, + StandardCharsets.UTF_8.name()), + RuntimeEnvironment.getInstance().getHugeTextLimitCharacters()); + } +} diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzerFactory.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzerFactory.java new file mode 100644 index 00000000000..f3f84651e1d --- /dev/null +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/data/HugeTextAnalyzerFactory.java @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, Chris Fraire . + */ + +package org.opengrok.indexer.analysis.data; + +import org.opengrok.indexer.analysis.AbstractAnalyzer; +import org.opengrok.indexer.analysis.FileAnalyzerFactory; + +/** + * Represents a factory for creating {@link HugeTextAnalyzer} instances. + */ +public class HugeTextAnalyzerFactory extends FileAnalyzerFactory { + + private static final String NAME = "Huge Text"; + + /** + * Gets a factory instance with no associated file extensions nor magic nor + * any other mapping attribute. + */ + public static final HugeTextAnalyzerFactory DEFAULT_INSTANCE = new HugeTextAnalyzerFactory(); + + private HugeTextAnalyzerFactory() { + super(null, null, null, null, null, null, AbstractAnalyzer.Genre.DATA, NAME); + } + + /** + * Creates a new {@link HugeTextAnalyzer} instance. 
+ * @return a defined instance + */ + @Override + protected AbstractAnalyzer newAnalyzer() { + return new HugeTextAnalyzer(this); + } +} diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/Configuration.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/Configuration.java index 666929ffa9f..2df3e93abaa 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/Configuration.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/Configuration.java @@ -76,6 +76,8 @@ public final class Configuration { private static final Logger LOGGER = LoggerFactory.getLogger(Configuration.class); public static final String PLUGIN_DIRECTORY_DEFAULT = "plugins"; + public static final int HUGE_TEXT_THRESHOLD_BYTES_DEFAULT = 1_000_000; + public static final int HUGE_TEXT_LIMIT_CHARACTERS_DEFAULT = 5_000_000; /** * A check if a pattern contains at least one pair of parentheses meaning @@ -301,6 +303,9 @@ public final class Configuration { private Set disabledRepositories; + private int hugeTextThresholdBytes; + private int hugeTextLimitCharacters; + /* * types of handling history for remote SCM repositories: * ON - index history and display it in webapp @@ -526,6 +531,8 @@ public Configuration() { setHistoryCacheTime(30); setHistoryEnabled(true); setHitsPerPage(25); + setHugeTextLimitCharacters(HUGE_TEXT_LIMIT_CHARACTERS_DEFAULT); + setHugeTextThresholdBytes(HUGE_TEXT_THRESHOLD_BYTES_DEFAULT); setIgnoredNames(new IgnoredNames()); setIncludedNames(new Filter()); setIndexVersionedFilesOnly(false); @@ -1323,6 +1330,37 @@ public void setDisabledRepositories(Set disabledRepositories) { this.disabledRepositories = disabledRepositories; } + /** + * Gets the number of bytes at which a plain-text file will be analyzed + * as a huge text data file and be ineligible for xref. Default is 1_000_000. 
+ */ + public int getHugeTextThresholdBytes() { + return hugeTextThresholdBytes; + } + + /** + * Sets the number of bytes at which a plain-text file will be analyzed + * as a huge text data file and be ineligible for xref. + */ + public void setHugeTextThresholdBytes(int value) { + hugeTextThresholdBytes = Math.max(value, 0); + } + + /** + * Gets the number of characters to analyze from a huge text data file. + * Default is 5_000_000. + */ + public int getHugeTextLimitCharacters() { + return hugeTextLimitCharacters; + } + + /** + * Sets the number of characters to analyze from a huge text data file. + */ + public void setHugeTextLimitCharacters(int value) { + hugeTextLimitCharacters = Math.max(value, 0); + } + /** * Write the current configuration to a file. * diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/RuntimeEnvironment.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/RuntimeEnvironment.java index 652df3ca690..69b0bb23d0d 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/RuntimeEnvironment.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/RuntimeEnvironment.java @@ -1342,6 +1342,38 @@ public void setDisabledRepositories(Set disabledRepositories) { syncWriteConfiguration(disabledRepositories, Configuration::setDisabledRepositories); } + /** + * Gets the configured number of bytes at which a plain-text file will be + * analyzed as a huge text data file and be ineligible for xref. + */ + public int getHugeTextThresholdBytes() { + return syncReadConfiguration(Configuration::getHugeTextThresholdBytes); + } + + /** + * Sets the configured number of bytes at which a plain-text file will be + * analyzed as a huge text data file and be ineligible for xref. 
+ */ + public void setHugeTextThresholdBytes(int hugeTextThresholdBytes) { + syncWriteConfiguration(hugeTextThresholdBytes, Configuration::setHugeTextThresholdBytes); + } + + /** + * Gets the configured number of characters to analyze from a huge text + * data file. + */ + public int getHugeTextLimitCharacters() { + return syncReadConfiguration(Configuration::getHugeTextLimitCharacters); + } + + /** + * Sets the configured number of characters to analyze from a huge text + * data file. + */ + public void setHugeTextLimitCharacters(int hugeTextLimitCharacters) { + syncWriteConfiguration(hugeTextLimitCharacters, Configuration::setHugeTextLimitCharacters); + } + /** * Read an configuration file and set it as the current configuration. * diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java index 872955ee976..86c2905f2be 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java @@ -88,6 +88,7 @@ import org.opengrok.indexer.analysis.AnalyzerGuru; import org.opengrok.indexer.analysis.Ctags; import org.opengrok.indexer.analysis.Definitions; +import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory; import org.opengrok.indexer.configuration.PathAccepter; import org.opengrok.indexer.configuration.Project; import org.opengrok.indexer.configuration.RuntimeEnvironment; @@ -709,6 +710,11 @@ private void addFile(File file, String path, Ctags ctags) RuntimeEnvironment env = RuntimeEnvironment.getInstance(); AbstractAnalyzer fa = getAnalyzerFor(file, path); + if (AbstractAnalyzer.Genre.PLAIN.equals(fa.getGenre()) && + file.length() >= env.getHugeTextThresholdBytes()) { + fa = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer(); + } + for (IndexChangedListener listener : listeners) { listener.fileAdd(path, fa.getClass().getSimpleName()); } @@ 
-1708,14 +1714,14 @@ private void finishWriting() throws IOException { } /** - * Verify TABSIZE, and evaluate AnalyzerGuru version together with ZVER -- - * or return a value to indicate mismatch. + * Verify TABSIZE, validate AnalyzerGuru version together with Analyzer + * version, and recheck huge text file constraint -- or return a value to + * indicate mismatch. * @param file the source file object * @param path the source file path * @return {@code false} if a mismatch is detected */ - private boolean checkSettings(File file, - String path) throws IOException { + private boolean checkSettings(File file, String path) throws IOException { RuntimeEnvironment env = RuntimeEnvironment.getInstance(); boolean outIsXrefWriter = false; @@ -1759,8 +1765,7 @@ private boolean checkSettings(File file, break; } - AnalyzerFactory fac = - AnalyzerGuru.findByFileTypeName(fileTypeName); + AnalyzerFactory fac = AnalyzerGuru.findByFileTypeName(fileTypeName); if (fac != null) { fa = fac.getAnalyzer(); } @@ -1795,7 +1800,27 @@ private boolean checkSettings(File file, return false; } + // If it is a Huge Text file, re-check constraints. + if (AnalyzerGuru.getHugeTextFileTypeName().equals(fileTypeName) && + file.length() < env.getHugeTextThresholdBytes()) { + if (LOGGER.isLoggable(Level.FINE)) { + LOGGER.log(Level.FINE, "{0} no longer qualifies: {1}", + new Object[]{fileTypeName, path}); + } + return false; + } + if (fa != null) { + // If the Genre is PLAIN, re-check Huge Text file constraints. 
+ if (AbstractAnalyzer.Genre.PLAIN.equals(fa.getGenre()) && + file.length() >= env.getHugeTextThresholdBytes()) { + if (LOGGER.isLoggable(Level.FINE)) { + LOGGER.log(Level.FINE, "{0} is now a huge text file: {1}", + new Object[]{fileTypeName, path}); + } + return false; + } + outIsXrefWriter = isXrefWriter(fa); } diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java index 9e8e2ac41bd..f7e9db95558 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java @@ -432,7 +432,18 @@ public static String[] parseOptions(String[] argv) throws ParseException { searchPaths.clear(); - // Limit usage lines to 72 characters for concise formatting. + /* + * FOR CONCISE FORMATTING, LIMIT USAGE DESCRIPTION LINES TO + * + * 8888888888 .d8888b. 888 + * d88P d88P Y88b 888 + * d88P 888 888 + * d88P .d88P .d8888b 88888b. 8888b. 888d888 .d8888b + * 88888888 .od888P" d88P" 888 "88b "88b 888P" 88K + * d88P d88P" 888 888 888 .d888888 888 "Y8888b. + * d88P 888" Y88b. 888 888 888 888 888 X88 + * d88P 888888888 "Y8888P 888 888 "Y888888 888 88888P' + */ optParser = OptionParser.execute(parser -> { parser.setPrologue( @@ -550,16 +561,27 @@ public static String[] parseOptions(String[] argv) throws ParseException { parser.on("-H", "--history", "Enable history.").execute(v -> cfg.setHistoryEnabled(true)); + parser.on("--historyRenamedThreads", "=number", Integer.class, + "The number of threads to use for history cache generation when dealing", + "with renamed files. By default the number of threads will be set to the", + "number of available CPUs. Assumes --renamedHistory=on").execute(threadCount -> + cfg.setHistoryRenamedParallelism((Integer) threadCount)); + parser.on("--historyThreads", "=number", Integer.class, - "The number of threads to use for history cache generation. 
By default the number", - "of threads will be set to the number of available CPUs. Assumes -H/--history.").execute(threadCount -> + "The number of threads to use for history cache generation. By default", + "the number of threads will be set to the number of available CPUs.", + "Assumes -H/--history.").execute(threadCount -> cfg.setHistoryParallelism((Integer) threadCount)); - parser.on("--historyRenamedThreads", "=number", Integer.class, - "The number of threads to use for history cache generation when dealing with renamed files.", - "By default the number of threads will be set to the number of available CPUs.", - "Assumes --renamedHistory=on").execute(threadCount -> - cfg.setHistoryRenamedParallelism((Integer) threadCount)); + parser.on("--hugeBytes", "=number", Integer.class, + "Threshold number of bytes to qualify a Huge Text data file vs a plain-", + "text source code file. Default is 1_000_000.").execute(value -> + cfg.setHugeTextThresholdBytes((int) value)); + + parser.on("--hugeCharacters", "=number", Integer.class, + "Limit for number of characters to read and index from a Huge Text data", + "file. Default is 5_000_000.").execute(value -> + cfg.setHugeTextLimitCharacters((int) value)); parser.on("-I", "--include", "=pattern", "Only files matching this pattern will be examined. Pattern supports", diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/util/LimitedReader.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/util/LimitedReader.java new file mode 100644 index 00000000000..5c106e03439 --- /dev/null +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/util/LimitedReader.java @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, Chris Fraire . + */ + +package org.opengrok.indexer.util; + +import java.io.IOException; +import java.io.Reader; + +/** + * Represents a {@link Reader} wrapper that limits characters read as specified + * and only up to {@link Integer#MAX_VALUE} to accommodate Lucene offset limits. + */ +public class LimitedReader extends Reader { + + private final int characterLimit; + private final Reader underlying; + private int characterCount; + private boolean didEOF; + + /** + * Initializes a new instance to wrap the specified {@code underlying}. + * @param underlying a defined instance + * @param characterLimit a non-negative number or alternatively a negative + * number to indicate {@link Integer#MAX_VALUE} + */ + public LimitedReader(Reader underlying, int characterLimit) { + if (underlying == null) { + throw new IllegalArgumentException("underlying is null"); + } + this.underlying = underlying; + this.characterLimit = characterLimit < 0 ? Integer.MAX_VALUE : characterLimit; + } + + /** + * Calls {@link Reader#read(char[], int, int)} on the underlying {@link Reader} + * but only up to {@code characterLimit}, after which EOF will be indicated.
+ * @return The number of characters read, or -1 if the end of the stream or + * the {@code characterLimit} has been reached + */ + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + if (didEOF) { + return -1; + } + + int adjustedLen = Math.min(len, characterLimit - characterCount); + int ret = underlying.read(cbuf, off, adjustedLen); + if (ret < 0) { + didEOF = true; + return -1; + } + characterCount += ret; + if (characterCount >= characterLimit) { + didEOF = true; + } + return ret; + } + + /** + * Calls {@link Reader#close()} on the underlying {@link Reader}. + */ + @Override + public void close() throws IOException { + underlying.close(); + } +} diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SearchHelper.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SearchHelper.java index c9be44dd108..ca38ff60ad9 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SearchHelper.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SearchHelper.java @@ -641,12 +641,11 @@ public void destroy() { /** * Searches for a document for a single file from the index. 
* @param file the file whose definitions to find - * @return {@link ScoreDoc#doc} or -1 if it could not be found + * @return a defined instance or {@code null} if not found * @throws IOException if an error happens when accessing the index * @throws ParseException if an error happens when building the Lucene query */ - public int searchSingle(File file) throws IOException, - ParseException { + public SingleResult searchSingleResult(File file) throws IOException, ParseException { RuntimeEnvironment env = RuntimeEnvironment.getInstance(); String path; @@ -654,7 +653,7 @@ public int searchSingle(File file) throws IOException, path = env.getPathRelativeToSourceRoot(file); } catch (ForbiddenSymlinkException e) { LOGGER.log(Level.FINER, e.getMessage()); - return -1; + return null; } //sanitize windows path delimiters //in order not to conflict with Lucene escape character @@ -668,7 +667,7 @@ public int searchSingle(File file) throws IOException, TopDocs top = searcher.search(query, 1); if (top.totalHits.value == 0) { - return -1; + return null; } int docID = top.scoreDocs[0].doc; @@ -677,10 +676,41 @@ public int searchSingle(File file) throws IOException, String foundPath = doc.get(QueryBuilder.PATH); // Only use the result if PATH matches exactly. if (!path.equals(foundPath)) { - return -1; + return null; } - return docID; + return new SingleResult(doc, docID); + } + + /** + * Searches for a document for a single file from the index. 
+ * @param file the file whose definitions to find + * @return {@link ScoreDoc#doc} or -1 if it could not be found + * @throws IOException if an error happens when accessing the index + * @throws ParseException if an error happens when building the Lucene query + */ + public int searchSingle(File file) throws IOException, ParseException { + SingleResult result = searchSingleResult(file); + if (result != null) { + return result.getDocID(); + } + return -1; + } + + /** + * Searches for a document for a single file from the index to retrieve its + * {@link AbstractAnalyzer.Genre}. + * @param file the file whose definitions to find + * @return a defined instance or {@code null} if not found + * @throws IOException if an error happens when accessing the index + * @throws ParseException if an error happens when building the Lucene query + */ + public AbstractAnalyzer.Genre searchSingleGenre(File file) throws IOException, ParseException { + SingleResult result = searchSingleResult(file); + if (result != null) { + return AbstractAnalyzer.Genre.get(result.getDocument().get(QueryBuilder.T)); + } + return null; } /** diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SingleResult.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SingleResult.java new file mode 100644 index 00000000000..2e1055693ae --- /dev/null +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/web/SingleResult.java @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, Chris Fraire . + */ + +package org.opengrok.indexer.web; + +import org.apache.lucene.document.Document; + +/** + * Represents a single-document search result. + */ +public class SingleResult { + private final Document document; + private final int docID; + + public SingleResult(Document document, int docID) { + this.document = document; + this.docID = docID; + } + + public Document getDocument() { + return document; + } + + public int getDocID() { + return docID; + } +} diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/index/HugeTextTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/index/HugeTextTest.java new file mode 100644 index 00000000000..622ffc1cc84 --- /dev/null +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/index/HugeTextTest.java @@ -0,0 +1,185 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2008, 2018, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright (c) 2017-2020, Chris Fraire . + * Portions Copyright (c) 2020, Ric Harris . 
+ */ + +package org.opengrok.indexer.index; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.junit.After; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.Test; +import org.opengrok.indexer.condition.ConditionalRunRule; +import org.opengrok.indexer.configuration.Project; +import org.opengrok.indexer.configuration.RuntimeEnvironment; +import org.opengrok.indexer.history.RepositoryFactory; +import org.opengrok.indexer.util.TestRepository; + +import java.io.IOException; +import java.util.Queue; +import java.util.concurrent.ConcurrentLinkedQueue; + +/** + * @author Trond Norbye + */ +public class HugeTextTest { + + private static RuntimeEnvironment env; + private TestRepository repository; + private int savedHugeTextLimitCharacters; + private int savedHugeTextThresholdBytes; + + @Rule + public ConditionalRunRule rule = new ConditionalRunRule(); + + @BeforeClass + public static void setUpClass() { + env = RuntimeEnvironment.getInstance(); + RepositoryFactory.initializeIgnoredNames(env); + } + + @Before + public void setUp() throws IOException { + repository = new TestRepository(); + repository.create(HugeTextTest.class.getResourceAsStream("source.zip")); + + savedHugeTextLimitCharacters = env.getHugeTextLimitCharacters(); + savedHugeTextThresholdBytes = env.getHugeTextThresholdBytes(); + } + + @After + public void tearDown() { + repository.destroy(); + + env.setHugeTextLimitCharacters(savedHugeTextLimitCharacters); + env.setHugeTextThresholdBytes(savedHugeTextThresholdBytes); + } + + @Test + public void shouldIndexFilesPerChangingHugeTextSettings() throws Exception { + env.setSourceRoot(repository.getSourceRoot()); + env.setDataRoot(repository.getDataRoot()); + env.setRepositories(repository.getSourceRoot()); + + Project project = new Project("sql"); + project.setPath("/sql"); + + IndexDatabase idb = new IndexDatabase(project); + ConcurrentIndexChangeListener 
listener = new ConcurrentIndexChangeListener(); + idb.addIndexChangedListener(listener); + idb.update(); + assertEquals("should add expected files",2, listener.addedFiles.size()); + assertTrue("removedFiles should be empty", listener.removedFiles.isEmpty()); + assertTrue("should have added /sql/test.sql", listener.addedFiles.contains( + new AddedFile("/sql/test.sql", "SQLAnalyzer"))); + assertTrue("should have added /sql/bug18586.sql", listener.addedFiles.contains( + new AddedFile("/sql/bug18586.sql", "SQLAnalyzer"))); + + env.setHugeTextThresholdBytes(300); + listener.reset(); + idb.update(); + assertEquals("should add expected files",1, listener.addedFiles.size()); + assertEquals("should remove expected files",1, listener.removedFiles.size()); + assertTrue("should have added /sql/test.sql", listener.addedFiles.contains( + new AddedFile("/sql/test.sql", "HugeTextAnalyzer"))); + assertTrue("should have removed /sql/test.sql", listener.removedFiles.contains( + "/sql/test.sql")); + + env.setHugeTextThresholdBytes(savedHugeTextThresholdBytes); + listener.reset(); + idb.update(); + assertEquals("should add expected files",1, listener.addedFiles.size()); + assertEquals("should remove expected files",1, listener.removedFiles.size()); + assertTrue("should have added /sql/test.sql", listener.addedFiles.contains( + new AddedFile("/sql/test.sql", "SQLAnalyzer"))); + assertTrue("should have removed /sql/test.sql", listener.removedFiles.contains( + "/sql/test.sql")); + } + + private static class ConcurrentIndexChangeListener implements IndexChangedListener { + + final Queue addedFiles = new ConcurrentLinkedQueue<>(); + final Queue removedFiles = new ConcurrentLinkedQueue<>(); + + @Override + public void fileAdd(String path, String analyzer) { + } + + @Override + public void fileAdded(String path, String analyzer) { + addedFiles.add(new AddedFile(path, analyzer)); + } + + @Override + public void fileRemove(String path) { + } + + @Override + public void fileRemoved(String path) { +
removedFiles.add(path); + } + + void reset() { + this.addedFiles.clear(); + this.removedFiles.clear(); + } + } + + private static class AddedFile { + final String path; + final String analyzer; + + AddedFile(String path, String analyzer) { + this.path = path; + this.analyzer = analyzer; + } + + /** Generated by IntelliJ. */ + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) { + return false; + } + + AddedFile addedFile = (AddedFile) o; + + if (!path.equals(addedFile.path)) { + return false; + } + return analyzer.equals(addedFile.analyzer); + } + + /** Generated by IntelliJ. */ + @Override + public int hashCode() { + int result = path.hashCode(); + result = 31 * result + analyzer.hashCode(); + return result; + } + } +} diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/util/LimitedReaderTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/util/LimitedReaderTest.java new file mode 100644 index 00000000000..c59b3adeffc --- /dev/null +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/util/LimitedReaderTest.java @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020, Chris Fraire . 
+ */ + +package org.opengrok.indexer.util; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; + +import java.io.IOException; +import java.io.StringReader; + +/** + * Represents a container for tests of {@link LimitedReader}. + */ +public class LimitedReaderTest { + + private static final String LIPSUM = "Lorem ipsum dolor sit amet, consectetur adipiscing " + + "elit. Proin dignissim sollicitudin est vitae aliquam. Nam leo nisl, lobortis at " + + "finibus nec, dignissim sed augue. Nullam commodo libero lectus, ac scelerisque ante " + + "luctus ac. Praesent varius volutpat lacinia. Praesent nec vulputate eros."; + + @Test + public void shouldReadToMax() throws IOException { + String value = readToLimit(-1); + assertEquals("should read to max", LIPSUM, value); + } + + @Test + public void shouldReadToTruncated() throws IOException { + String value = readToLimit(10); + assertEquals("should read to truncated", "Lorem ipsu", value); + } + + @Test + public void shouldReadNone() throws IOException { + String value = readToLimit(0); + assertEquals("should read nothing", "", value); + } + + private static String readToLimit(int characterLimit) throws IOException { + StringBuilder b = new StringBuilder(); + char[] buf = new char[37]; + try (LimitedReader reader = new LimitedReader(new StringReader(LIPSUM), characterLimit)) { + int n; + while ((n = reader.read(buf, 0, buf.length)) != -1) { + b.append(buf, 0, n); + } + } + return b.toString(); + } +} diff --git a/opengrok-web/src/main/java/org/opengrok/web/PageConfig.java b/opengrok-web/src/main/java/org/opengrok/web/PageConfig.java index 0e8c4c6de90..f582ef97c3c 100644 --- a/opengrok-web/src/main/java/org/opengrok/web/PageConfig.java +++ b/opengrok-web/src/main/java/org/opengrok/web/PageConfig.java @@ -1488,6 +1488,11 @@ public File getDataRoot() { * executing the prepared query or continue processing. *

* This method stops populating fields as soon as an error occurs. + *

+ * The result is stored as a request attribute keyed to + * {@link SearchHelper#REQUEST_ATTR} for later cleanup via + * {@link SearchHelper#destroy()}. Any object already set will have + * {@link SearchHelper#destroy()} called. * * @return a search helper. */ @@ -1519,9 +1524,21 @@ public SearchHelper prepareSearch() { * executing the prepared query or continue processing. *

* This method stops populating fields as soon as an error occurs. + *

+ * The result is stored as a request attribute keyed to + * {@link SearchHelper#REQUEST_ATTR} for later cleanup via + * {@link SearchHelper#destroy()}. Any object already set will have + * {@link SearchHelper#destroy()} called. + * * @return a search helper. */ public SearchHelper prepareInternalSearch() { + Object cached = req.getAttribute(SearchHelper.REQUEST_ATTR); + if (cached != null) { + req.setAttribute(SearchHelper.REQUEST_ATTR, null); + ((SearchHelper) cached).destroy(); + } + SearchHelper sh = new SearchHelper(); sh.dataRoot = getDataRoot(); // throws Exception if none-existent sh.order = SortOrder.RELEVANCY; @@ -1537,6 +1554,13 @@ public SearchHelper prepareInternalSearch() { sh.sourceRoot = new File(getSourceRootPath()); String xrValue = req.getParameter(QueryParameters.NO_REDIRECT_PARAM); sh.noRedirect = xrValue != null && !xrValue.isEmpty(); + + /* + * N.b. searchHelper.destroy() is called via + * WebappListener.requestDestroyed() on presence of the following + * REQUEST_ATTR. + */ + req.setAttribute(SearchHelper.REQUEST_ATTR, sh); return sh; } diff --git a/opengrok-web/src/main/webapp/history.jsp b/opengrok-web/src/main/webapp/history.jsp index 97deda04873..63238282861 100644 --- a/opengrok-web/src/main/webapp/history.jsp +++ b/opengrok-web/src/main/webapp/history.jsp @@ -61,13 +61,12 @@ org.opengrok.indexer.web.Util" String primePath = path; Project project = cfg.getProject(); if (project != null) { - SearchHelper searchHelper = cfg.prepareInternalSearch(); /* * N.b. searchHelper.destroy() is called via - * WebappListener.requestDestroyed() on presence of the following - * REQUEST_ATTR. + * WebappListener.requestDestroyed() on presence of an attribute, + * REQUEST_ATTR, set by the following. 
*/ - request.setAttribute(SearchHelper.REQUEST_ATTR, searchHelper); + SearchHelper searchHelper = cfg.prepareInternalSearch(); searchHelper.prepareExec(project); try { diff --git a/opengrok-web/src/main/webapp/list.jsp b/opengrok-web/src/main/webapp/list.jsp index 1e5aa346dba..3bbcf09678b 100644 --- a/opengrok-web/src/main/webapp/list.jsp +++ b/opengrok-web/src/main/webapp/list.jsp @@ -166,19 +166,13 @@ document.pageReady.push(function() { pageReadyList();}); List files = cfg.getResourceFileList(); if (!files.isEmpty()) { List extras = null; - SearchHelper searchHelper = cfg.prepareInternalSearch(); /* * N.b. searchHelper.destroy() is called via - * WebappListener.requestDestroyed() on presence of the following - * REQUEST_ATTR. + * WebappListener.requestDestroyed() on presence of an attribute, + * REQUEST_ATTR, set by the following. */ - request.setAttribute(SearchHelper.REQUEST_ATTR, searchHelper); - if (project != null) { - searchHelper.prepareExec(project); - } else { - //noinspection Convert2Diamond - searchHelper.prepareExec(new TreeSet()); - } + SearchHelper searchHelper = cfg.prepareInternalSearch(); + prepareExec(searchHelper, project); if (searchHelper.searcher != null) { DirectoryExtraReader extraReader = new DirectoryExtraReader(); @@ -302,8 +296,22 @@ Click download <%= basename %><% } } else { // requesting a previous revision or needed to generate xref on the fly (economy mode). + AnalyzerFactory a = AnalyzerGuru.find(basename); - Genre g = AnalyzerGuru.getGenre(a); + /* + * N.b. searchHelper.destroy() is called via + * WebappListener.requestDestroyed() on presence of an attribute, + * REQUEST_ATTR, set by the following. 
+ */ + SearchHelper searchHelper = cfg.prepareInternalSearch(); + prepareExec(searchHelper, project); + Genre g = null; + if (searchHelper.searcher != null) { + g = searchHelper.searchSingleGenre(resourceFile); + } + if (g == null) { + g = AnalyzerGuru.getGenre(a); + } String error = null; if (g == Genre.PLAIN || g == Genre.HTML || g == null) { InputStream in = null; @@ -338,7 +346,7 @@ Click download <%= basename %><% if (g == AbstractAnalyzer.Genre.DATA || g == AbstractAnalyzer.Genre.XREFABLE || g == null) { %>

<% } else { @@ -401,7 +409,7 @@ Click download <%= basename %><% */ Util.dumpXref(out, r, request.getContextPath()); } else { - %>Download binary file, ?<%= QueryParameters.REVISION_PARAM_EQ %> <%= Util.URIEncode(rev) %>"><%= basename %><% } } @@ -439,7 +447,7 @@ Click download <%= basename %><% } else { %> <% } @@ -468,3 +476,13 @@ Click download <%= basename %><% include file="foot.jspf" %> +<%! + private static void prepareExec(SearchHelper searchHelper, Project project) { + if (project != null) { + searchHelper.prepareExec(project); + } else { + //noinspection Convert2Diamond + searchHelper.prepareExec(new TreeSet()); + } + } +%> diff --git a/opengrok-web/src/main/webapp/more.jsp b/opengrok-web/src/main/webapp/more.jsp index e24a91d285a..d80f6a2bb6f 100644 --- a/opengrok-web/src/main/webapp/more.jsp +++ b/opengrok-web/src/main/webapp/more.jsp @@ -63,13 +63,12 @@ file="mast.jsp" if (activeProject == null) { qbuilder = cfg.getQueryBuilder(); } else { - searchHelper = cfg.prepareInternalSearch(); /* * N.b. searchHelper.destroy() is called via - * WebappListener.requestDestroyed() on presence of the following - * REQUEST_ATTR. + * WebappListener.requestDestroyed() on presence of an attribute, + * REQUEST_ATTR, set by the following. */ - request.setAttribute(SearchHelper.REQUEST_ATTR, searchHelper); + searchHelper = cfg.prepareInternalSearch(); searchHelper.prepareExec(activeProject); if (searchHelper.searcher != null) { docId = searchHelper.searchSingle(resourceFile); diff --git a/opengrok-web/src/main/webapp/search.jsp b/opengrok-web/src/main/webapp/search.jsp index 84415b75111..5460acfad08 100644 --- a/opengrok-web/src/main/webapp/search.jsp +++ b/opengrok-web/src/main/webapp/search.jsp @@ -71,11 +71,12 @@ include file="projects.jspf" { PageConfig cfg = PageConfig.get(request); + /* + * N.b. searchHelper.destroy() is called via + * WebappListener.requestDestroyed() on presence of an attribute, + * REQUEST_ATTR, set by the following. 
+ */ SearchHelper searchHelper = cfg.prepareSearch(); - // N.b. searchHelper.destroy() is called via - // WebappListener.requestDestroyed() on presence of the following - // REQUEST_ATTR. - request.setAttribute(SearchHelper.REQUEST_ATTR, searchHelper); searchHelper.prepareExec(cfg.getRequestedProjects()).executeQuery().prepareSummary(); // notify suggester that query was searched SuggesterServiceFactory.getDefault().onSearch(cfg.getRequestedProjects(), searchHelper.query); From f9ff866eb8bb67df7ddb421aa378776de6b393c2 Mon Sep 17 00:00:00 2001 From: Chris Fraire Date: Fri, 17 Apr 2020 22:10:59 -0500 Subject: [PATCH 07/15] Fix #2560 : recognize Huge Text in gzip or bzip2 --- .../analysis/archive/BZip2Analyzer.java | 42 +++---- .../analysis/archive/CompressedAnalyzer.java | 109 ++++++++++++++++++ .../analysis/archive/GZIPAnalyzer.java | 36 +----- 3 files changed, 128 insertions(+), 59 deletions(-) create mode 100644 opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java index c07e392afda..d6c065d6847 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java @@ -27,14 +27,16 @@ import java.io.IOException; import java.io.InputStream; import java.io.Writer; +import java.util.logging.Level; +import java.util.logging.Logger; + import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.apache.tools.bzip2.CBZip2InputStream; import org.opengrok.indexer.analysis.AbstractAnalyzer; import org.opengrok.indexer.analysis.AnalyzerFactory; import org.opengrok.indexer.analysis.AnalyzerGuru; -import org.opengrok.indexer.analysis.FileAnalyzer; import 
org.opengrok.indexer.analysis.StreamSource; +import org.opengrok.indexer.logger.LoggerFactory; import org.opengrok.indexer.search.QueryBuilder; /** @@ -43,17 +45,9 @@ * Created on September 22, 2005 * @author Chandan */ -public class BZip2Analyzer extends FileAnalyzer { - - private Genre g; +public class BZip2Analyzer extends CompressedAnalyzer { - @Override - public Genre getGenre() { - if (g != null) { - return g; - } - return super.getGenre(); - } + private static final Logger LOGGER = LoggerFactory.getLogger(BZip2Analyzer.class); protected BZip2Analyzer(AnalyzerFactory factory) { super(factory); @@ -71,11 +65,11 @@ public String getCtagsLang() { * Gets a version number to be used to tag processed documents so that * re-analysis can be re-done later if a stored version number is different * from the current implementation. - * @return 20180111_00 + * @return 20200417_00 */ @Override protected int getSpecializedVersionNo() { - return 20180111_00; // Edit comment above too! + return 20200417_00; // Edit comment above too! } @Override @@ -92,20 +86,12 @@ public void analyze(Document doc, StreamSource src, Writer xrefOut) try (InputStream in = bzSrc.getStream()) { fa = AnalyzerGuru.getAnalyzer(in, newname); } - if (!(fa instanceof BZip2Analyzer)) { - if (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE) { - this.g = Genre.XREFABLE; - } else { - this.g = Genre.DATA; - } - fa.analyze(doc, bzSrc, xrefOut); - if (doc.get(QueryBuilder.T) != null) { - doc.removeField(QueryBuilder.T); - if (g == Genre.XREFABLE) { - doc.add(new Field(QueryBuilder.T, g.typeName(), - AnalyzerGuru.string_ft_stored_nanalyzed_norms)); - } - } + if (fa == null) { + this.g = Genre.DATA; + LOGGER.log(Level.WARNING, "Did not analyze {0} detected as data.", newname); + //TODO we could probably wrap tar analyzer here, need to do research on reader coming from gzis ... 
+ } else if (!(fa instanceof BZip2Analyzer)) { + analyzeUncompressed(doc, xrefOut, fa, bzSrc); } } } diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java new file mode 100644 index 00000000000..4029654b000 --- /dev/null +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java @@ -0,0 +1,109 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * See LICENSE.txt included in this distribution for the specific + * language governing permissions and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at LICENSE.txt. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2018, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright (c) 2017-2020, Chris Fraire . 
+ */ + +package org.opengrok.indexer.analysis.archive; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.opengrok.indexer.analysis.AbstractAnalyzer; +import org.opengrok.indexer.analysis.AnalyzerFactory; +import org.opengrok.indexer.analysis.AnalyzerGuru; +import org.opengrok.indexer.analysis.FileAnalyzer; +import org.opengrok.indexer.analysis.StreamSource; +import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory; +import org.opengrok.indexer.configuration.RuntimeEnvironment; +import org.opengrok.indexer.search.QueryBuilder; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Writer; + +/** + * Represents a base for compressed formats (e.g. gzip or bzip2) but not for + * archive formats that have compression (e.g. Zip or Jar). + * @author Chandan + */ +public abstract class CompressedAnalyzer extends FileAnalyzer { + + protected Genre g; + + @Override + public Genre getGenre() { + if (g != null) { + return g; + } + return super.getGenre(); + } + + protected CompressedAnalyzer(AnalyzerFactory factory) { + super(factory); + } + + protected void analyzeUncompressed( + Document doc, Writer xrefOut, AbstractAnalyzer fa, StreamSource compressedSrc) + throws IOException, InterruptedException { + + if (fa.getGenre() == Genre.PLAIN) { + if (meetsHugeTextThreshold(compressedSrc)) { + fa = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer(); + g = Genre.DATA; + } else { + g = Genre.XREFABLE; + } + } else if (fa.getGenre() == Genre.XREFABLE) { + g = Genre.XREFABLE; + } else { + g = Genre.DATA; + } + + fa.analyze(doc, compressedSrc, xrefOut); + if (doc.get(QueryBuilder.T) != null) { + doc.removeField(QueryBuilder.T); + } + doc.add(new Field(QueryBuilder.T, g.typeName(), + AnalyzerGuru.string_ft_stored_nanalyzed_norms)); + } + + private boolean meetsHugeTextThreshold(StreamSource compressedSrc) throws IOException { + RuntimeEnvironment env = RuntimeEnvironment.getInstance(); + int 
hugeTextThresholdBytes = env.getHugeTextThresholdBytes(); + if (Integer.MAX_VALUE == hugeTextThresholdBytes) { + // Don't bother decompressing to count if the limit is MAX_VALUE. + return false; + } + + byte[] buf = new byte[8 * 1024]; + int bytesRead = 0; + int n; + try (InputStream in = compressedSrc.getStream()) { + while ((n = in.read(buf, 0, buf.length)) != -1) { + bytesRead += n; + if (bytesRead >= hugeTextThresholdBytes) { + return true; + } + } + } + return false; + } +} diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java index 839e6594d59..bbee4082c0e 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java @@ -32,11 +32,9 @@ import java.util.logging.Logger; import java.util.zip.GZIPInputStream; import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.opengrok.indexer.analysis.AbstractAnalyzer; import org.opengrok.indexer.analysis.AnalyzerFactory; import org.opengrok.indexer.analysis.AnalyzerGuru; -import org.opengrok.indexer.analysis.FileAnalyzer; import org.opengrok.indexer.analysis.StreamSource; import org.opengrok.indexer.logger.LoggerFactory; import org.opengrok.indexer.search.QueryBuilder; @@ -47,20 +45,10 @@ * Created on September 22, 2005 * @author Chandan */ -public class GZIPAnalyzer extends FileAnalyzer { +public class GZIPAnalyzer extends CompressedAnalyzer { private static final Logger LOGGER = LoggerFactory.getLogger(GZIPAnalyzer.class); - private Genre g; - - @Override - public Genre getGenre() { - if (g != null) { - return g; - } - return super.getGenre(); - } - protected GZIPAnalyzer(AnalyzerFactory factory) { super(factory); } @@ -77,11 +65,11 @@ public String getCtagsLang() { * Gets a version number to be used to tag processed 
documents so that * re-analysis can be re-done later if a stored version number is different * from the current implementation. - * @return 20180111_00 + * @return 20200417_00 */ @Override protected int getSpecializedVersionNo() { - return 20180111_00; // Edit comment above too! + return 20200417_00; // Edit comment above too! } @Override @@ -93,30 +81,16 @@ public void analyze(Document doc, StreamSource src, Writer xrefOut) String path = doc.get(QueryBuilder.PATH); if (path != null && path.toLowerCase(Locale.ROOT).endsWith(".gz")) { String newname = path.substring(0, path.length() - 3); - //System.err.println("GZIPPED OF = " + newname); try (InputStream gzis = gzSrc.getStream()) { fa = AnalyzerGuru.getAnalyzer(gzis, newname); } if (fa == null) { this.g = Genre.DATA; - LOGGER.log(Level.WARNING, "Did not analyze {0}, detected as data.", newname); + LOGGER.log(Level.WARNING, "Did not analyze {0} detected as data.", newname); //TODO we could probably wrap tar analyzer here, need to do research on reader coming from gzis ... } else { // cant recurse! 
//simple file gziped case captured here - if (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE) { - this.g = Genre.XREFABLE; - } else { - this.g = Genre.DATA; - } - fa.analyze(doc, gzSrc, xrefOut); - if (doc.get(QueryBuilder.T) != null) { - doc.removeField(QueryBuilder.T); - if (g == Genre.XREFABLE) { - doc.add(new Field(QueryBuilder.T, g.typeName(), - AnalyzerGuru.string_ft_stored_nanalyzed_norms)); - } - } - + analyzeUncompressed(doc, xrefOut, fa, gzSrc); } } } From 9f3955832408951e9fb052b4b56eda1bb7eb470f Mon Sep 17 00:00:00 2001 From: Chris Fraire Date: Sat, 18 Apr 2020 10:36:45 -0500 Subject: [PATCH 08/15] Attempt InputStream.skip() for more efficiency --- .../analysis/archive/CompressedAnalyzer.java | 41 +++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java index 4029654b000..06f00e7a80c 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java @@ -46,6 +46,8 @@ */ public abstract class CompressedAnalyzer extends FileAnalyzer { + private static final int CHUNK_SIZE = 8 * 1024; + protected Genre g; @Override @@ -93,10 +95,17 @@ private boolean meetsHugeTextThreshold(StreamSource compressedSrc) throws IOExce return false; } - byte[] buf = new byte[8 * 1024]; - int bytesRead = 0; - int n; try (InputStream in = compressedSrc.getStream()) { + // Try skip first. + SkipResult result = meetsHugeTextThresholdBySkip(in, hugeTextThresholdBytes); + if (result.didMeet) { + return true; + } + + // Even if some skipped, only read==-1 is a true indicator of EOF. 
+ long bytesRead = result.bytesSkipped; + byte[] buf = new byte[CHUNK_SIZE]; + long n; while ((n = in.read(buf, 0, buf.length)) != -1) { bytesRead += n; if (bytesRead >= hugeTextThresholdBytes) { @@ -106,4 +115,30 @@ private boolean meetsHugeTextThreshold(StreamSource compressedSrc) throws IOExce } return false; } + + private SkipResult meetsHugeTextThresholdBySkip(InputStream in, int hugeTextThresholdBytes) { + long bytesSkipped = 0; + long n; + try { + while ((n = in.skip(CHUNK_SIZE)) > 0) { + bytesSkipped += n; + if (bytesSkipped >= hugeTextThresholdBytes) { + return new SkipResult(bytesSkipped, true); + } + } + } catch (IOException ignored) { + // Ignore and assume not capable of skip. + } + return new SkipResult(bytesSkipped, false); + } + + private static class SkipResult { + final long bytesSkipped; + final boolean didMeet; + + SkipResult(long bytesSkipped, boolean didMeet) { + this.bytesSkipped = bytesSkipped; + this.didMeet = didMeet; + } + } } From 1f5d48431b0ddfec6e9891de41d682f6587b63d6 Mon Sep 17 00:00:00 2001 From: Chris Fraire Date: Sat, 18 Apr 2020 16:57:54 -0500 Subject: [PATCH 09/15] Tweak test to work on Windows --- .../test/java/org/opengrok/indexer/index/HugeTextTest.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/index/HugeTextTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/index/HugeTextTest.java index 622ffc1cc84..3cd55d1f0a1 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/index/HugeTextTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/index/HugeTextTest.java @@ -38,6 +38,7 @@ import org.opengrok.indexer.configuration.RuntimeEnvironment; import org.opengrok.indexer.history.RepositoryFactory; import org.opengrok.indexer.util.TestRepository; +import org.opengrok.indexer.web.Util; import java.io.IOException; import java.util.Queue; @@ -131,7 +132,7 @@ public void fileAdd(String path, String analyzer) { @Override 
public void fileAdded(String path, String analyzer) { - addedFiles.add(new AddedFile(path, analyzer)); + addedFiles.add(new AddedFile(Util.fixPathIfWindows(path), analyzer)); } @Override @@ -140,7 +141,7 @@ public void fileRemove(String path) { @Override public void fileRemoved(String path) { - removedFiles.add(path); + removedFiles.add(Util.fixPathIfWindows(path)); } void reset() { From 15458dab3e5548f20f4d89f1b236e3026d2aed09 Mon Sep 17 00:00:00 2001 From: Chris Fraire Date: Tue, 21 Apr 2020 17:02:13 -0500 Subject: [PATCH 10/15] Show WARNINGs about content classified as Huge Text --- .../opengrok/indexer/analysis/StreamSource.java | 17 ++++++++++++++++- .../indexer/analysis/archive/BZip2Analyzer.java | 5 +++++ .../analysis/archive/CompressedAnalyzer.java | 10 ++++++++++ .../indexer/analysis/archive/GZIPAnalyzer.java | 5 +++++ .../opengrok/indexer/index/IndexDatabase.java | 13 +++++++++---- .../indexer/analysis/JFlexXrefTest.java | 7 ++++++- .../indexer/analysis/TextAnalyzerTest.java | 7 ++++++- .../analysis/c/CAnalyzerFactoryTest.java | 13 ++----------- .../analysis/c/CxxAnalyzerFactoryTest.java | 13 ++----------- .../clojure/ClojureAnalyzerFactoryTest.java | 13 ++----------- .../csharp/CSharpAnalyzerFactoryTest.java | 13 ++----------- .../analysis/document/TroffAnalyzerTest.java | 6 ++++++ .../analysis/java/JavaAnalyzerFactoryTest.java | 13 ++----------- .../pascal/PascalAnalyzerFactoryTest.java | 13 ++----------- .../plain/DefinitionsTokenStreamTest.java | 7 ++++++- .../org/opengrok/indexer/util/StreamUtils.java | 7 ++++++- 16 files changed, 87 insertions(+), 75 deletions(-) diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/StreamSource.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/StreamSource.java index 7017224f715..535b83979da 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/StreamSource.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/StreamSource.java @@ -19,7 
+19,7 @@ /* * Copyright (c) 2013, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2018, Chris Fraire . + * Portions Copyright (c) 2018, 2020, Chris Fraire . */ package org.opengrok.indexer.analysis; @@ -52,6 +52,11 @@ public abstract class StreamSource { */ public abstract InputStream getStream() throws IOException; + /** + * Gets a reportable identifier of the source. + */ + public abstract String getSourceIdentifier(); + /** * Helper method that creates a {@code StreamSource} instance that * reads data from a file. @@ -65,6 +70,11 @@ public static StreamSource fromFile(final File file) { public InputStream getStream() throws IOException { return new BufferedInputStream(new FileInputStream(file)); } + + @Override + public String getSourceIdentifier() { + return file.getAbsolutePath(); + } }; } @@ -82,6 +92,11 @@ public static StreamSource fromString(final String str) { public InputStream getStream() throws IOException { return new ByteArrayInputStream(sbuf); } + + @Override + public String getSourceIdentifier() { + return "String"; + } }; } } diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java index d6c065d6847..9ce77b11e20 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/BZip2Analyzer.java @@ -112,6 +112,11 @@ public InputStream getStream() throws IOException { throw new IOException("Not BZIP2 format"); } } + + @Override + public String getSourceIdentifier() { + return src.getSourceIdentifier(); + } }; } } diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java index 06f00e7a80c..4306e2c5174 100644 --- 
a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/CompressedAnalyzer.java @@ -33,11 +33,14 @@ import org.opengrok.indexer.analysis.StreamSource; import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory; import org.opengrok.indexer.configuration.RuntimeEnvironment; +import org.opengrok.indexer.logger.LoggerFactory; import org.opengrok.indexer.search.QueryBuilder; import java.io.IOException; import java.io.InputStream; import java.io.Writer; +import java.util.logging.Level; +import java.util.logging.Logger; /** * Represents a base for compressed formats (e.g. gzip or bzip2) but not for @@ -46,6 +49,8 @@ */ public abstract class CompressedAnalyzer extends FileAnalyzer { + private static final Logger LOGGER = LoggerFactory.getLogger(CompressedAnalyzer.class); + private static final int CHUNK_SIZE = 8 * 1024; protected Genre g; @@ -68,8 +73,13 @@ protected void analyzeUncompressed( if (fa.getGenre() == Genre.PLAIN) { if (meetsHugeTextThreshold(compressedSrc)) { + String origFileTypeName = fa.getFileTypeName(); fa = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer(); g = Genre.DATA; + if (LOGGER.isLoggable(Level.WARNING)) { + LOGGER.log(Level.WARNING, "{0} is compressed huge text: {1}", + new Object[]{origFileTypeName, compressedSrc.getSourceIdentifier()}); + } } else { g = Genre.XREFABLE; } diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java index bbee4082c0e..e735dc49724 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/archive/GZIPAnalyzer.java @@ -105,6 +105,11 @@ public InputStream getStream() throws IOException { return new BufferedInputStream( new GZIPInputStream(src.getStream())); } 
+ + @Override + public String getSourceIdentifier() { + return src.getSourceIdentifier(); + } }; } } diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java index 86c2905f2be..7935ed6f41a 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java @@ -712,7 +712,12 @@ private void addFile(File file, String path, Ctags ctags) if (AbstractAnalyzer.Genre.PLAIN.equals(fa.getGenre()) && file.length() >= env.getHugeTextThresholdBytes()) { + String origFileTypeName = fa.getFileTypeName(); fa = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer(); + if (LOGGER.isLoggable(Level.WARNING)) { + LOGGER.log(Level.WARNING, "{0} is huge text: {1}", + new Object[]{origFileTypeName, path}); + } } for (IndexChangedListener listener : listeners) { @@ -1803,8 +1808,8 @@ private boolean checkSettings(File file, String path) throws IOException { // If it is a Huge Text file, re-check constraints. if (AnalyzerGuru.getHugeTextFileTypeName().equals(fileTypeName) && file.length() < env.getHugeTextThresholdBytes()) { - if (LOGGER.isLoggable(Level.FINE)) { - LOGGER.log(Level.FINE, "{0} no longer qualifies: {1}", + if (LOGGER.isLoggable(Level.WARNING)) { + LOGGER.log(Level.WARNING, "{0} no longer qualifies: {1}", new Object[]{fileTypeName, path}); } return false; @@ -1814,8 +1819,8 @@ private boolean checkSettings(File file, String path) throws IOException { // If the Genre is PLAIN, re-check Huge Text file constraints. 
if (AbstractAnalyzer.Genre.PLAIN.equals(fa.getGenre()) && file.length() >= env.getHugeTextThresholdBytes()) { - if (LOGGER.isLoggable(Level.FINE)) { - LOGGER.log(Level.FINE, "{0} is now a huge text file: {1}", + if (LOGGER.isLoggable(Level.WARNING)) { + LOGGER.log(Level.WARNING, "{0} is now huge text: {1}", new Object[]{fileTypeName, path}); } return false; diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/JFlexXrefTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/JFlexXrefTest.java index d640c865f0c..59658c168ec 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/JFlexXrefTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/JFlexXrefTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2010, 2019, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . */ package org.opengrok.indexer.analysis; @@ -505,6 +505,11 @@ public void testJavaClassAnalyzer() throws Exception { ".class"; return StringWriter.class.getResourceAsStream(path); } + + @Override + public String getSourceIdentifier() { + return "StringWriter.class"; + } }; Document doc = new Document(); StringWriter out = new StringWriter(); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/TextAnalyzerTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/TextAnalyzerTest.java index 6c3409160d4..f0db02d7d31 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/TextAnalyzerTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/TextAnalyzerTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2010, 2018, Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017, Chris Fraire . + * Portions Copyright (c) 2017, 2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis; @@ -49,6 +49,11 @@ private static StreamSource getStreamSource(final byte[] bytes) { public InputStream getStream() throws IOException { return new ByteArrayInputStream(bytes); } + + @Override + public String getSourceIdentifier() { + return "byte[]"; + } }; } diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CAnalyzerFactoryTest.java index 32d1ea44067..97fbddf8211 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CAnalyzerFactoryTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2015, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . */ package org.opengrok.indexer.analysis.c; @@ -60,15 +60,6 @@ public class CAnalyzerFactoryTest { private static TestRepository repository; private static AbstractAnalyzer analyzer; - private static StreamSource getStreamSource(final String fname) { - return new StreamSource() { - @Override - public InputStream getStream() throws IOException { - return new FileInputStream(fname); - } - }; - } - @BeforeClass public static void setUpClass() throws Exception { ctags = new Ctags(); @@ -110,7 +101,7 @@ public void testScopeAnalyzer() throws Exception { StringWriter xrefOut = new StringWriter(); analyzer.setCtags(ctags); analyzer.setScopesEnabled(true); - analyzer.analyze(doc, getStreamSource(path), xrefOut); + analyzer.analyze(doc, StreamSource.fromFile(f), xrefOut); IndexableField scopesField = doc.getField(QueryBuilder.SCOPES); assertNotNull(scopesField); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CxxAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CxxAnalyzerFactoryTest.java index 
365209353bb..3bf1125eb69 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CxxAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/c/CxxAnalyzerFactoryTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2015, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . */ package org.opengrok.indexer.analysis.c; @@ -60,15 +60,6 @@ public class CxxAnalyzerFactoryTest { private static TestRepository repository; private static AbstractAnalyzer analyzer; - private static StreamSource getStreamSource(final String fname) { - return new StreamSource() { - @Override - public InputStream getStream() throws IOException { - return new FileInputStream(fname); - } - }; - } - @BeforeClass public static void setUpClass() throws Exception { ctags = new Ctags(); @@ -111,7 +102,7 @@ public void testScopeAnalyzer() throws Exception { analyzer.setScopesEnabled(true); System.out.println(path); - analyzer.analyze(doc, getStreamSource(path), xrefOut); + analyzer.analyze(doc, StreamSource.fromFile(f), xrefOut); IndexableField scopesField = doc.getField(QueryBuilder.SCOPES); assertNotNull(scopesField); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/clojure/ClojureAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/clojure/ClojureAnalyzerFactoryTest.java index 8852f0dca37..ee1cee1dfb5 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/clojure/ClojureAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/clojure/ClojureAnalyzerFactoryTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2016, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.clojure; @@ -57,15 +57,6 @@ public class ClojureAnalyzerFactoryTest { private static TestRepository repository; private static AbstractAnalyzer analyzer; - private static StreamSource getStreamSource(final String fname) { - return new StreamSource() { - @Override - public InputStream getStream() throws IOException { - return new FileInputStream(fname); - } - }; - } - @BeforeClass public static void setUpClass() throws Exception { ctags = new Ctags(); @@ -106,7 +97,7 @@ public void testScopeAnalyzer() throws Exception { string_ft_nstored_nanalyzed_norms)); StringWriter xrefOut = new StringWriter(); analyzer.setCtags(ctags); - analyzer.analyze(doc, getStreamSource(path), xrefOut); + analyzer.analyze(doc, StreamSource.fromFile(f), xrefOut); Definitions definitions = Definitions.deserialize(doc.getField(QueryBuilder.TAGS).binaryValue().bytes); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/csharp/CSharpAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/csharp/CSharpAnalyzerFactoryTest.java index 8b2a6b0ecd5..cda1cc09547 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/csharp/CSharpAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/csharp/CSharpAnalyzerFactoryTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2015, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.csharp; @@ -56,15 +56,6 @@ public class CSharpAnalyzerFactoryTest { private static TestRepository repository; private static AbstractAnalyzer analyzer; - private static StreamSource getStreamSource(final String fname) { - return new StreamSource() { - @Override - public InputStream getStream() throws IOException { - return new FileInputStream(fname); - } - }; - } - @BeforeClass public static void setUpClass() throws Exception { ctags = new Ctags(); @@ -105,7 +96,7 @@ public void testScopeAnalyzer() throws Exception { StringWriter xrefOut = new StringWriter(); analyzer.setCtags(ctags); analyzer.setScopesEnabled(true); - analyzer.analyze(doc, getStreamSource(path), xrefOut); + analyzer.analyze(doc, StreamSource.fromFile(f), xrefOut); IndexableField scopesField = doc.getField(QueryBuilder.SCOPES); assertNotNull(scopesField); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/document/TroffAnalyzerTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/document/TroffAnalyzerTest.java index 48356f2a265..ec67d9798ce 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/document/TroffAnalyzerTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/document/TroffAnalyzerTest.java @@ -20,6 +20,7 @@ /* * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved. * Portions copyright 2009 - 2011 Jens Elkner. + * Portions Copyright (c) 2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.document; @@ -124,6 +125,11 @@ public void testAnalyze() throws IOException { public InputStream getStream() throws IOException { return new ByteArrayInputStream(content.getBytes()); } + + @Override + public String getSourceIdentifier() { + return "String"; + } }, xrefOut); } diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/java/JavaAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/java/JavaAnalyzerFactoryTest.java index be100a8739e..f4599ff22f5 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/java/JavaAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/java/JavaAnalyzerFactoryTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2015, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . */ package org.opengrok.indexer.analysis.java; @@ -60,15 +60,6 @@ public class JavaAnalyzerFactoryTest { private static TestRepository repository; private static AbstractAnalyzer analyzer; - private static StreamSource getStreamSource(final String fname) { - return new StreamSource() { - @Override - public InputStream getStream() throws IOException { - return new FileInputStream(fname); - } - }; - } - @BeforeClass public static void setUpClass() throws Exception { ctags = new Ctags(); @@ -108,7 +99,7 @@ public void testScopeAnalyzer() throws Exception { StringWriter xrefOut = new StringWriter(); analyzer.setCtags(ctags); analyzer.setScopesEnabled(true); - analyzer.analyze(doc, getStreamSource(path), xrefOut); + analyzer.analyze(doc, StreamSource.fromFile(f), xrefOut); IndexableField scopesField = doc.getField(QueryBuilder.SCOPES); assertNotNull(scopesField); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/pascal/PascalAnalyzerFactoryTest.java 
b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/pascal/PascalAnalyzerFactoryTest.java index 6021442d69c..f17d80fcec3 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/pascal/PascalAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/pascal/PascalAnalyzerFactoryTest.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2016, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2017-2019, Chris Fraire . + * Portions Copyright (c) 2017-2020, Chris Fraire . */ package org.opengrok.indexer.analysis.pascal; @@ -58,15 +58,6 @@ public class PascalAnalyzerFactoryTest { private static TestRepository repository; private static AbstractAnalyzer analyzer; - private static StreamSource getStreamSource(final String fname) { - return new StreamSource() { - @Override - public InputStream getStream() throws IOException { - return new FileInputStream(fname); - } - }; - } - @BeforeClass public static void setUpClass() throws Exception { ctags = new Ctags(); @@ -108,7 +99,7 @@ public void testAnalyzer() throws Exception { StringWriter xrefOut = new StringWriter(); analyzer.setCtags(ctags); analyzer.setScopesEnabled(true); - analyzer.analyze(doc, getStreamSource(path), xrefOut); + analyzer.analyze(doc, StreamSource.fromFile(f), xrefOut); Definitions definitions = Definitions.deserialize(doc.getField(QueryBuilder.TAGS).binaryValue().bytes); assertNotNull(definitions); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/plain/DefinitionsTokenStreamTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/plain/DefinitionsTokenStreamTest.java index fcc2d20df9b..e384cdb3865 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/plain/DefinitionsTokenStreamTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/plain/DefinitionsTokenStreamTest.java @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2018, Chris Fraire . 
+ * Copyright (c) 2018, 2020, Chris Fraire . */ package org.opengrok.indexer.analysis.plain; @@ -174,6 +174,11 @@ public InputStream getStream() throws IOException { assertNotNull(name + " as resource,", srcres); return srcres; } + + @Override + public String getSourceIdentifier() { + return name; + } }; } } diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/util/StreamUtils.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/util/StreamUtils.java index b684ed262ef..8726a4d189e 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/util/StreamUtils.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/util/StreamUtils.java @@ -19,7 +19,7 @@ /* * Copyright (c) 2017, 2018 Oracle and/or its affiliates. All rights reserved. - * Portions Copyright (c) 2018-2019, Chris Fraire . + * Portions Copyright (c) 2018-2020, Chris Fraire . */ package org.opengrok.indexer.util; @@ -162,6 +162,11 @@ public InputStream getStream() { assertNotNull("resource " + resourceName, res); return new BufferedInputStream(res); } + + @Override + public String getSourceIdentifier() { + return resourceName; + } }; } From bc1aea96083a7d0b42e99c2ebe709952f6201197 Mon Sep 17 00:00:00 2001 From: Chris Fraire Date: Sat, 25 Apr 2020 18:47:07 -0500 Subject: [PATCH 11/15] Make list.jsp Huge-Text-aware Also, move some logic properly to AnalyzerGuru that had crept into IndexDatabase. 
--- .../indexer/analysis/AnalyzerGuru.java | 181 +++++++++++++----- .../opengrok/indexer/index/IndexDatabase.java | 61 +----- .../archive/ZipAnalyzerFactoryTest.java | 4 +- .../executables/JarAnalyzerFactoryTest.java | 4 +- .../JavaClassAnalyzerFactoryTest.java | 6 +- .../java/org/opengrok/web/PageConfig.java | 32 ++-- opengrok-web/src/main/webapp/history.jsp | 4 +- opengrok-web/src/main/webapp/list.jsp | 137 +++++++------ opengrok-web/src/main/webapp/mast.jsp | 9 +- opengrok-web/src/main/webapp/minisearch.jspf | 2 +- 10 files changed, 255 insertions(+), 185 deletions(-) diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java index bfdadc77b34..e4b0e86cec0 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/AnalyzerGuru.java @@ -23,8 +23,10 @@ */ package org.opengrok.indexer.analysis; +import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; @@ -47,6 +49,7 @@ import java.util.TreeSet; import java.util.logging.Level; import java.util.logging.Logger; + import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -130,6 +133,13 @@ */ public class AnalyzerGuru { + /** + * A value used as a placeholder for a filename when content is anonymous + * (e.g. from temporary source or from a stream for which an identifier is + * not available). + */ + public static final String ANONYMOUS_NAME = ""; + /** * The maximum number of characters (multi-byte if a BOM is identified) to * read from the input stream to be used for magic string matching. 
@@ -551,29 +561,92 @@ public static AbstractAnalyzer getAnalyzer(String fileTypeName) { } /** - * Get an analyzer suited to analyze a file. This function will reuse - * analyzers since they are costly. + * Gets an analyzer factory suited to analyze a file, but without a check + * for Huge Text since the file size is not available. * * @param in Input stream containing data to be analyzed - * @param file Name of the file to be analyzed - * @return An analyzer suited for that file content + * @param fileName Name of the file to be analyzed + * @return An analyzer factory suited for that file content * @throws java.io.IOException If an error occurs while accessing the data * in the input stream. */ - public static AbstractAnalyzer getAnalyzer(InputStream in, String file) throws IOException { - AnalyzerFactory factory = find(in, file); + public static AnalyzerFactory getAnalyzerFactory(InputStream in, String fileName) + throws IOException { + AnalyzerFactory factory = find(in, fileName); if (factory == null) { - AbstractAnalyzer defaultAnalyzer = getAnalyzer(); + factory = DEFAULT_ANALYZER_FACTORY; if (LOGGER.isLoggable(Level.FINEST)) { + AbstractAnalyzer defaultAnalyzer = factory.getAnalyzer(); LOGGER.log(Level.FINEST, "{0}: fallback {1}", - new Object[]{file, - defaultAnalyzer.getClass().getSimpleName() }); + new Object[]{fileName, defaultAnalyzer.getClass().getSimpleName()}); } - return defaultAnalyzer; } + return factory; + } + + /** + * Gets an analyzer suited to analyze a file, but without a check for Huge + * Text since the file size is not available. + * + * @param in Input stream containing data to be analyzed + * @param fileName Name of the file to be analyzed + * @return An analyzer suited for the file content + * @throws java.io.IOException If an error occurs while accessing the data + * in the input stream. 
+ */ + public static AbstractAnalyzer getAnalyzer(InputStream in, String fileName) + throws IOException { + AnalyzerFactory factory = getAnalyzerFactory(in, fileName); return factory.getAnalyzer(); } + /** + * Gets an analyzer factory suited to analyze a file, with a check for Huge + * Text. + * + * @param file a defined instance to be analyzed + * @param path Name (possibly normalized) of the file to be analyzed + * @param logHugeText a value indicating whether to log if the file is + * identified as Huge Text + * @return An analyzer factory suited for the file content + * @throws java.io.IOException If an error occurs while reading the file + */ + public static AnalyzerFactory getAnalyzerFactory(File file, String path, boolean logHugeText) + throws IOException { + + AnalyzerFactory fac; + try (InputStream in = new BufferedInputStream( + new FileInputStream(file))) { + fac = AnalyzerGuru.getAnalyzerFactory(in, path); + } + + if (AbstractAnalyzer.Genre.PLAIN.equals(fac.getGenre()) && + file.length() >= RuntimeEnvironment.getInstance().getHugeTextThresholdBytes()) { + if (logHugeText && LOGGER.isLoggable(Level.WARNING)) { + String origFileTypeName = fac.getAnalyzer().getFileTypeName(); + LOGGER.log(Level.WARNING, "{0} is huge text: {1}", + new Object[]{origFileTypeName, path}); + } + fac = HugeTextAnalyzerFactory.DEFAULT_INSTANCE; // swap only after the original type name has been logged + } + return fac; + } + + /** + * Get an analyzer suited to analyze a file, with a check for Huge Text.
+ * + * @param file a defined instance to be analyzed + * @param path Name (possibly normalized) of the file to be analyzed + * @param logHugeText a value indicating whether to log if the file is + * identified as Huge Text + * @return An analyzer suited for the file content + * @throws java.io.IOException If an error occurs while reading the file + */ + public static AbstractAnalyzer getAnalyzer(File file, String path, boolean logHugeText) + throws IOException { + return getAnalyzerFactory(file, path, logHugeText).getAnalyzer(); + } + /** * Free resources associated with all registered analyzers. */ @@ -718,24 +791,36 @@ public static void writeDumpedXref(String contextPath, } /** - * Get the genre of a file. + * Get the genre of a file, with a check for Huge Text. * * @param file The file to inspect + * @param fileName name of the file to inspect * @return The genre suitable to decide how to display the file */ - public static AbstractAnalyzer.Genre getGenre(String file) { - return getGenre(find(file)); + public static AbstractAnalyzer.Genre getGenre(File file, String fileName) { + try { + return getGenre(getAnalyzerFactory(file, fileName, true)); + } catch (IOException e) { + LOGGER.log(Level.WARNING, "Error reading {0}", fileName); + return null; + } } /** - * Get the genre of a bulk of data. + * Get the genre of a bulk of data, but without a check for Huge Text since + * the file size is not available. 
* * @param in A stream containing the data + * @param fileName name of the file to inspect * @return The genre suitable to decide how to display the file - * @throws java.io.IOException If an error occurs while getting the content */ - public static AbstractAnalyzer.Genre getGenre(InputStream in) throws IOException { - return getGenre(find(in)); + public static AbstractAnalyzer.Genre getGenre(InputStream in, String fileName) { + try { + return getGenre(getAnalyzerFactory(in, fileName)); + } catch (IOException e) { + LOGGER.log(Level.WARNING, "Error reading {0}", fileName); + return null; + } } /** @@ -881,13 +966,12 @@ private static AnalyzerFactory findFactory(Class factoryClass) * * * @param in The input stream containing the data - * @param file The file name to get the analyzer for + * @param fileName The file name to get the analyzer for * @return the analyzer factory to use * @throws java.io.IOException If a problem occurs while reading the data */ - public static AnalyzerFactory find(InputStream in, String file) - throws IOException { - AnalyzerFactory factory = find(file); + static AnalyzerFactory find(InputStream in, String fileName) throws IOException { + AnalyzerFactory factory = find(fileName); // TODO above is not that great, since if 2 analyzers share one extension // then only the first one registered will own it // it would be cool if above could return more analyzers and below would @@ -895,17 +979,23 @@ public static AnalyzerFactory find(InputStream in, String file) if (factory != null) { return factory; } - return findForStream(in, file); + return findForStream(in, fileName); } /** - * Finds a suitable analyser class for file name. + * Finds a suitable analyser class for {@code fileName}, which should only + * be used in rare situations, such as for a JAR member or when content is + * not available to support a full determination. + *

To clarify, a full determination as done by + * {@link #getAnalyzerFactory(File, String, boolean)} also reads a bit of + * content as well as inspects file length to determine the ultimate + * analyser. * - * @param file The file name to get the analyzer for + * @param fileName The file name to get the analyzer for * @return the analyzer factory to use */ - public static AnalyzerFactory find(String file) { - String path = file; + public static AnalyzerFactory find(String fileName) { + String path = fileName; int i; // Get basename of the file first. @@ -924,8 +1014,7 @@ public static AnalyzerFactory find(String file) { if (factory != null) { if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, "{0}: chosen by prefix: {1}", - new Object[]{file, - factory.getClass().getSimpleName() }); + new Object[]{fileName, factory.getClass().getSimpleName()}); } return factory; } @@ -938,8 +1027,7 @@ public static AnalyzerFactory find(String file) { if (factory != null) { if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, "{0}: chosen by suffix: {1}", - new Object[]{file, - factory.getClass().getSimpleName() }); + new Object[]{fileName, factory.getClass().getSimpleName()}); } return factory; } @@ -957,8 +1045,8 @@ public static AnalyzerFactory find(String file) { * @throws java.io.IOException if an error occurs while reading data from * the stream */ - public static AnalyzerFactory find(InputStream in) throws IOException { - return findForStream(in, ""); + static AnalyzerFactory find(InputStream in) throws IOException { + return findForStream(in, ANONYMOUS_NAME); } /** @@ -966,13 +1054,13 @@ public static AnalyzerFactory find(InputStream in) throws IOException { * corresponding to a file of the specified name. 
* * @param in The stream containing the data to analyze - * @param file The file name to get the analyzer for + * @param fileName The file name to get the analyzer for * @return the analyzer factory to use * @throws java.io.IOException if an error occurs while reading data from * the stream */ - private static AnalyzerFactory findForStream(InputStream in, - String file) throws IOException { + private static AnalyzerFactory findForStream(InputStream in, String fileName) + throws IOException { in.mark(MAGIC_BYTES_NUM); byte[] content = new byte[MAGIC_BYTES_NUM]; @@ -998,8 +1086,8 @@ private static AnalyzerFactory findForStream(InputStream in, if (fac != null) { if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, - "{0}: chosen by precise magic: {1}", new Object[]{ - file, fac.getClass().getSimpleName() }); + "{0}: chosen by precise magic: {1}", + new Object[]{fileName, fac.getClass().getSimpleName()}); } return fac; } @@ -1008,7 +1096,7 @@ private static AnalyzerFactory findForStream(InputStream in, // Next, look for magic strings String opening = readOpening(in, content); - fac = findMagicString(opening, file); + fac = findMagicString(opening, fileName); if (fac != null) { return fac; } @@ -1020,9 +1108,8 @@ private static AnalyzerFactory findForStream(InputStream in, if (fac != null) { if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, - "{0}: chosen by imprecise magic: {1}", - new Object[]{file, - fac.getClass().getSimpleName() }); + "{0}: chosen by imprecise magic: {1}", + new Object[]{fileName, fac.getClass().getSimpleName()}); } return fac; } @@ -1032,7 +1119,7 @@ private static AnalyzerFactory findForStream(InputStream in, return null; } - private static AnalyzerFactory findMagicString(String opening, String file) { + private static AnalyzerFactory findMagicString(String opening, String fileName) { // first, try to look up two words in magics String fragment = getWords(opening, 2); @@ -1040,8 +1127,7 @@ private static AnalyzerFactory 
findMagicString(String opening, String file) { if (fac != null) { if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}", - new Object[]{file, fac.getClass().getSimpleName(), - fragment}); + new Object[]{fileName, fac.getClass().getSimpleName(), fragment}); } return fac; } @@ -1052,8 +1138,7 @@ private static AnalyzerFactory findMagicString(String opening, String file) { if (fac != null) { if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, "{0}: chosen by magic {2}: {1}", - new Object[]{file, fac.getClass().getSimpleName(), - fragment}); + new Object[]{fileName, fac.getClass().getSimpleName(), fragment}); } return fac; } @@ -1066,8 +1151,8 @@ private static AnalyzerFactory findMagicString(String opening, String file) { fac = entry.getValue(); if (LOGGER.isLoggable(Level.FINEST)) { LOGGER.log(Level.FINEST, - "{0}: chosen by magic(substr) {2}: {1}", new Object[]{ - file, fac.getClass().getSimpleName(), magic}); + "{0}: chosen by magic(substr) {2}: {1}", + new Object[]{fileName, fac.getClass().getSimpleName(), magic}); } return fac; } diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java index 7935ed6f41a..7e395255908 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java @@ -23,14 +23,11 @@ */ package org.opengrok.indexer.index; -import java.io.BufferedInputStream; import java.io.BufferedWriter; import java.io.File; -import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.file.Files; @@ -88,7 +85,6 @@ import org.opengrok.indexer.analysis.AnalyzerGuru; import org.opengrok.indexer.analysis.Ctags; import 
org.opengrok.indexer.analysis.Definitions; -import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory; import org.opengrok.indexer.configuration.PathAccepter; import org.opengrok.indexer.configuration.Project; import org.opengrok.indexer.configuration.RuntimeEnvironment; @@ -119,6 +115,7 @@ public class IndexDatabase { (File p1, File p2) -> p1.getName().compareTo(p2.getName()); private static final Set CHECK_FIELDS; + private static final RuntimeEnvironment env = RuntimeEnvironment.getInstance(); private final Object INSTANCE_LOCK = new Object(); @@ -191,7 +188,7 @@ public IndexDatabase(Project project) throws IOException { */ static CountDownLatch updateAll(IndexChangedListener listener) throws IOException { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); + List dbs = new ArrayList<>(); if (env.hasProjects()) { @@ -202,8 +199,7 @@ static CountDownLatch updateAll(IndexChangedListener listener) dbs.add(new IndexDatabase()); } - IndexerParallelizer parallelizer = RuntimeEnvironment.getInstance(). 
- getIndexerParallelizer(); + IndexerParallelizer parallelizer = env.getIndexerParallelizer(); CountDownLatch latch = new CountDownLatch(dbs.size()); for (IndexDatabase d : dbs) { final IndexDatabase db = d; @@ -236,7 +232,6 @@ public void run() { * @param paths list of paths to be indexed */ public static void update(IndexChangedListener listener, List paths) { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); IndexerParallelizer parallelizer = env.getIndexerParallelizer(); List dbs = new ArrayList<>(); @@ -291,7 +286,6 @@ public void run() { @SuppressWarnings("PMD.CollapsibleIfStatements") private void initialize() throws IOException { synchronized (INSTANCE_LOCK) { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); File indexDir = new File(env.getDataRootFile(), INDEX_DIR); if (project != null) { indexDir = new File(indexDir, project.getPath()); @@ -331,7 +325,7 @@ public boolean addDirectory(String dir) { } else if (directory.charAt(0) != '/') { directory = "/" + directory; } - File file = new File(RuntimeEnvironment.getInstance().getSourceRootFile(), directory); + File file = new File(env.getSourceRootFile(), directory); if (file.exists()) { directories.add(directory); return true; @@ -340,15 +334,13 @@ public boolean addDirectory(String dir) { } private void showFileCount(String dir, IndexDownArgs args) { - if (RuntimeEnvironment.getInstance().isPrintProgress()) { + if (env.isPrintProgress()) { LOGGER.log(Level.INFO, String.format("Need to process: %d files for %s", args.cur_count, dir)); } } private void markProjectIndexed(Project project) { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); - // Successfully indexed the project. The message is sent even if // the project's isIndexed() is true because it triggers RepositoryInfo // refresh. 
@@ -390,8 +382,6 @@ public void update() throws IOException { interrupted = false; } - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); - reader = null; writer = null; settings = null; @@ -534,7 +524,6 @@ public void update() throws IOException { */ static CountDownLatch optimizeAll() throws IOException { List dbs = new ArrayList<>(); - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); IndexerParallelizer parallelizer = env.getIndexerParallelizer(); if (env.hasProjects()) { for (Project project : env.getProjectList()) { @@ -657,7 +646,6 @@ private File whatXrefFile(String path, boolean compress) { * @param path path to file under source root */ private void removeXrefFile(String path) { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); File xrefFile = whatXrefFile(path, env.isCompressXref()); PendingFileDeletion pending = new PendingFileDeletion( xrefFile.getAbsolutePath()); @@ -707,18 +695,7 @@ private void removeFile(boolean removeHistory) throws IOException { private void addFile(File file, String path, Ctags ctags) throws IOException, InterruptedException { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); - AbstractAnalyzer fa = getAnalyzerFor(file, path); - - if (AbstractAnalyzer.Genre.PLAIN.equals(fa.getGenre()) && - file.length() >= env.getHugeTextThresholdBytes()) { - String origFileTypeName = fa.getFileTypeName(); - fa = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer(); - if (LOGGER.isLoggable(Level.WARNING)) { - LOGGER.log(Level.WARNING, "{0} is huge text: {1}", - new Object[]{origFileTypeName, path}); - } - } + AbstractAnalyzer fa = AnalyzerGuru.getAnalyzer(file, path, true); for (IndexChangedListener listener : listeners) { listener.fileAdd(path, fa.getClass().getSimpleName()); @@ -769,14 +746,6 @@ private void addFile(File file, String path, Ctags ctags) } } - private AbstractAnalyzer getAnalyzerFor(File file, String path) - throws IOException { - try (InputStream in = new BufferedInputStream( - new 
FileInputStream(file))) { - return AnalyzerGuru.getAnalyzer(in, path); - } - } - /** * Do a best effort to clean up all resources allocated when populating * a Lucene document. On normal execution, these resources should be @@ -858,7 +827,6 @@ private boolean accept(File file, AcceptSymlinkRet ret) { } // this is an unversioned file, check if it should be indexed - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); boolean res = !env.isIndexVersionedFilesOnly(); if (!res) { LOGGER.log(Level.FINER, "not accepting unversioned {0}", @@ -927,7 +895,6 @@ private boolean acceptSymlink(Path absolute, File canonical, AcceptSymlinkRet re String absolute1 = absolute.toString(); String canonical1 = canonical.getPath(); boolean isCanonicalDir = canonical.isDirectory(); - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); IndexedSymlink indexed1; String absolute0; @@ -1081,7 +1048,6 @@ private boolean acceptSymlink(Path absolute, File canonical, AcceptSymlinkRet re * @return true if the file is local to the current repository */ private boolean isLocal(String path) { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); String srcRoot = env.getSourceRootPath(); if (path.startsWith(srcRoot + File.separator)) { @@ -1228,8 +1194,7 @@ private void indexParallel(String dir, IndexDownArgs args) { AtomicInteger successCounter = new AtomicInteger(); AtomicInteger currentCounter = new AtomicInteger(); AtomicInteger alreadyClosedCounter = new AtomicInteger(); - IndexerParallelizer parallelizer = RuntimeEnvironment.getInstance(). 
- getIndexerParallelizer(); + IndexerParallelizer parallelizer = env.getIndexerParallelizer(); ObjectPool ctagsPool = parallelizer.getCtagsPool(); Map> bySuccess = null; @@ -1351,7 +1316,6 @@ public void addIndexChangedListener(IndexChangedListener listener) { */ public static Set getAllFiles(List subFiles) throws IOException { Set files = new HashSet<>(); - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); if (env.hasProjects()) { if (subFiles == null || subFiles.isEmpty()) { @@ -1451,7 +1415,6 @@ public int getNumFiles() throws IOException { static void listFrequentTokens(List subFiles) throws IOException { final int limit = 4; - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); if (env.hasProjects()) { if (subFiles == null || subFiles.isEmpty()) { for (Project project : env.getProjectList()) { @@ -1517,8 +1480,6 @@ public void listTokens(int freq) throws IOException { */ public static IndexReader getIndexReader(String path) { IndexReader ret = null; - - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); File indexDir = new File(env.getDataRootFile(), INDEX_DIR); if (env.hasProjects()) { @@ -1574,7 +1535,7 @@ public static Definitions getDefinitions(File file) throws ParseException, IOExc */ public static Document getDocument(File file) throws IOException, ParseException { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); + String path; try { path = env.getPathRelativeToSourceRoot(file); @@ -1648,7 +1609,7 @@ private boolean isXrefWriter(AbstractAnalyzer fa) { */ private Writer newXrefWriter(AbstractAnalyzer fa, String path) throws IOException { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); + if (env.isGenerateHtml() && isXrefWriter(fa)) { boolean compressed = env.isCompressXref(); File xrefFile = whatXrefFile(path, compressed); @@ -1728,7 +1689,6 @@ private void finishWriting() throws IOException { */ private boolean checkSettings(File file, String path) throws IOException { - RuntimeEnvironment env = 
RuntimeEnvironment.getInstance(); boolean outIsXrefWriter = false; int reqTabSize = project != null && project.hasTabSizeSetting() ? project.getTabSize() : 0; @@ -1782,7 +1742,7 @@ private boolean checkSettings(File file, String path) throws IOException { */ LOGGER.log(Level.FINER, "Guru version mismatch: {0}", path); - fa = getAnalyzerFor(file, path); + fa = AnalyzerGuru.getAnalyzer(file, path, false); fileTypeName = fa.getFileTypeName(); String oldTypeName = doc.get(QueryBuilder.TYPE); if (!fileTypeName.equals(oldTypeName)) { @@ -1868,7 +1828,6 @@ private IndexAnalysisSettings3 readAnalysisSettings() throws IOException { } private boolean xrefExistsFor(String path) { - RuntimeEnvironment env = RuntimeEnvironment.getInstance(); File xrefFile = whatXrefFile(path, env.isCompressXref()); if (!xrefFile.exists()) { LOGGER.log(Level.FINEST, "Missing {0}", xrefFile); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/archive/ZipAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/archive/ZipAnalyzerFactoryTest.java index 12d43697e17..83c75641830 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/archive/ZipAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/archive/ZipAnalyzerFactoryTest.java @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2018, Chris Fraire . + * Copyright (c) 2018, 2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.archive; @@ -47,7 +47,7 @@ public void testZipWrtAnalyzerGuru() throws IOException { assertNotNull("zip.bin should be available,", res); // assert that it is matched - AnalyzerFactory fac = AnalyzerGuru.find(res); + AnalyzerFactory fac = AnalyzerGuru.getAnalyzerFactory(res, AnalyzerGuru.ANONYMOUS_NAME); assertNotNull("zip.bin should have factory", fac); assertSame("should be ZipAnalyzerFactory", fac.getClass(), ZipAnalyzerFactory.class); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JarAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JarAnalyzerFactoryTest.java index 661cc9bd569..7a03f3f0cd0 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JarAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JarAnalyzerFactoryTest.java @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2018, Chris Fraire . + * Copyright (c) 2018, 2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.executables; @@ -47,7 +47,7 @@ public void testJarWrtAnalyzerGuru() throws IOException { assertNotNull("javajar.bin should be available,", res); // assert that it is matched - AnalyzerFactory fac = AnalyzerGuru.find(res); + AnalyzerFactory fac = AnalyzerGuru.getAnalyzerFactory(res, AnalyzerGuru.ANONYMOUS_NAME); assertNotNull("javajar.bin should have factory", fac); assertSame("should be JarAnalyzerFactory", fac.getClass(), JarAnalyzerFactory.class); diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JavaClassAnalyzerFactoryTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JavaClassAnalyzerFactoryTest.java index a6123eec18d..c00a02c45a6 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JavaClassAnalyzerFactoryTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/executables/JavaClassAnalyzerFactoryTest.java @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2017, Chris Fraire . + * Copyright (c) 2017, 2020, Chris Fraire . 
*/ package org.opengrok.indexer.analysis.executables; @@ -48,7 +48,7 @@ public void testJavaClassWrtAnalyzerGuru() throws IOException { assertNotNull("despite inclusion locally,", res); // assert that it is matched - AnalyzerFactory fac = AnalyzerGuru.find(res); + AnalyzerFactory fac = AnalyzerGuru.getAnalyzerFactory(res, AnalyzerGuru.ANONYMOUS_NAME); assertNotNull("javaclass.bin should have factory", fac); assertSame("should be JavaClassAnalyzerFactory", fac.getClass(), JavaClassAnalyzerFactory.class); @@ -64,7 +64,7 @@ public void testDylibCafebabeWrtAnalyzerGuru() throws IOException { "analysis/executables/fat.dylib"); assertNotNull("despite inclusion locally,", res); - AnalyzerFactory fac = AnalyzerGuru.find(res); + AnalyzerFactory fac = AnalyzerGuru.getAnalyzerFactory(res, AnalyzerGuru.ANONYMOUS_NAME); if (fac != null) { assertNotSame("should not be JavaClassAnalyzerFactory", fac.getClass(), JavaClassAnalyzerFactory.class); diff --git a/opengrok-web/src/main/java/org/opengrok/web/PageConfig.java b/opengrok-web/src/main/java/org/opengrok/web/PageConfig.java index f582ef97c3c..e5268d66c90 100644 --- a/opengrok-web/src/main/java/org/opengrok/web/PageConfig.java +++ b/opengrok-web/src/main/java/org/opengrok/web/PageConfig.java @@ -266,7 +266,7 @@ public DiffData getDiffData() { + getUriEncodedPath() + "\">history"; return data; } - data.genre = AnalyzerGuru.getGenre(getResourceFile().getName()); + data.genre = AnalyzerGuru.getGenre(getResourceFile(), getResourceFile().getName()); if (data.genre == null || txtGenres.contains(data.genre)) { InputStream[] in = new InputStream[2]; @@ -298,14 +298,11 @@ public DiffData getDiffData() { * version. 
*/ for (int i = 0; i < 2 && data.genre == null; i++) { - try { - data.genre = AnalyzerGuru.getGenre(in[i]); - } catch (IOException e) { - data.errorMsg = "Unable to determine the file type: " - + Util.htmlize(e.getMessage()); - } + data.genre = AnalyzerGuru.getGenre(in[i], getResourceFile().getName()); + } + if (data.genre == null) { + data.errorMsg = "Unable to determine the file type."; } - if (data.genre != AbstractAnalyzer.Genre.PLAIN && data.genre != AbstractAnalyzer.Genre.HTML) { return data; } @@ -731,14 +728,21 @@ public boolean hasAnnotations() { * * @return {@code true} if annotation is desired and available. */ - public boolean annotate() { + public boolean shouldAnnotate() { if (annotate == null) { - annotate = hasAnnotations() - && Boolean.parseBoolean(req.getParameter(QueryParameters.ANNOTATION_PARAM)); + annotate = wantsAnnotation() && hasAnnotations(); } return annotate; } + /** + * Gets a value indicating if the user submitted an affirmative value for + * the {@link QueryParameters#ANNOTATION_PARAM}. + */ + public boolean wantsAnnotation() { + return Boolean.parseBoolean(req.getParameter(QueryParameters.ANNOTATION_PARAM)); + } + /** * Get the annotation for the requested resource. * @@ -746,7 +750,7 @@ public boolean annotate() { * the cached annotation otherwise. */ public Annotation getAnnotation() { - if (isDir() || getResourcePath().equals("/") || !annotate()) { + if (isDir() || getResourcePath().equals("/") || !shouldAnnotate()) { return null; } if (annotation != null) { @@ -1800,7 +1804,9 @@ public boolean isNotModified(HttpServletRequest request, HttpServletResponse res // last timestamp value getEnv().getDateForLastIndexRun() != null ? 
getEnv().getDateForLastIndexRun().getTime() : 0, // OpenGrok version has changed since the last time - Info.getVersion() + Info.getVersion(), + // Whether the user indicated to annotate + wantsAnnotation() ) ); diff --git a/opengrok-web/src/main/webapp/history.jsp b/opengrok-web/src/main/webapp/history.jsp index 63238282861..65ab2c52a72 100644 --- a/opengrok-web/src/main/webapp/history.jsp +++ b/opengrok-web/src/main/webapp/history.jsp @@ -1,6 +1,4 @@ <%-- -$Id$ - CDDL HEADER START The contents of this file are subject to the terms of the @@ -111,7 +109,7 @@ include file="httpheader.jspf" diff --git a/opengrok-web/src/main/webapp/list.jsp b/opengrok-web/src/main/webapp/list.jsp index 3bbcf09678b..f7a514c24db 100644 --- a/opengrok-web/src/main/webapp/list.jsp +++ b/opengrok-web/src/main/webapp/list.jsp @@ -40,7 +40,6 @@ java.util.TreeSet, org.opengrok.indexer.analysis.AnalyzerGuru, org.opengrok.indexer.analysis.Ctags, org.opengrok.indexer.analysis.Definitions, -org.opengrok.indexer.analysis.AbstractAnalyzer, org.opengrok.indexer.analysis.AbstractAnalyzer.Genre, org.opengrok.indexer.analysis.AnalyzerFactory, org.opengrok.indexer.history.Annotation, @@ -62,6 +61,8 @@ org.opengrok.indexer.web.SearchHelper" final String DUMMY_REVISION = "unknown"; { + resetData(); + // need to set it here since requesting parameters if (request.getCharacterEncoding() == null) { request.setCharacterEncoding("UTF-8"); @@ -108,14 +109,27 @@ final String DUMMY_REVISION = "unknown"; } } - Annotation annotation = cfg.getAnnotation(); - if (annotation != null) { - int r = annotation.getWidestRevision(); - int a = annotation.getWidestAuthor(); - cfg.addHeaderData(""); + // Set just after the redirects above so that the field is defined early. 
+ project = cfg.getProject(); + + boolean isAnnotatableGenre = false; + if (cfg.wantsAnnotation() && !cfg.isDir()) { + prepareExec(cfg); + if (searchHelper.searcher != null) { + genre = searchHelper.searchSingleGenre(cfg.getResourceFile()); + isAnnotatableGenre = Genre.PLAIN.equals(genre); + } + } + if (isAnnotatableGenre) { + Annotation annotation = cfg.getAnnotation(); + if (annotation != null) { + int r = annotation.getWidestRevision(); + int a = annotation.getWidestAuthor(); + cfg.addHeaderData(""); + } } } %><%@include @@ -132,7 +146,6 @@ document.pageReady.push(function() { pageReadyList();}); PageConfig cfg = PageConfig.get(request); String rev = cfg.getRequestedRevision(); - Project project = cfg.getProject(); String navigateWindowEnabled = project != null ? Boolean.toString( project.isNavigateWindowEnabled()) : "false"; @@ -166,14 +179,8 @@ document.pageReady.push(function() { pageReadyList();}); List files = cfg.getResourceFileList(); if (!files.isEmpty()) { List extras = null; - /* - * N.b. searchHelper.destroy() is called via - * WebappListener.requestDestroyed() on presence of an attribute, - * REQUEST_ATTR, set by the following. 
- */ - SearchHelper searchHelper = cfg.prepareInternalSearch(); - prepareExec(searchHelper, project); + prepareExec(cfg); if (searchHelper.searcher != null) { DirectoryExtraReader extraReader = new DirectoryExtraReader(); String primePath = path; @@ -230,23 +237,20 @@ document.pageReady.push(function() { pageReadyList();}); File xrefFile; if (cfg.isLatestRevision(rev) && (xrefFile = cfg.findDataFile()) != null) { - if (cfg.annotate()) { + if (cfg.shouldAnnotate()) { // annotate BufferedInputStream bin = new BufferedInputStream(new FileInputStream(resourceFile)); try { - AnalyzerFactory a = AnalyzerGuru.find(basename); - AbstractAnalyzer.Genre g = AnalyzerGuru.getGenre(a); - if (g == null) { - a = AnalyzerGuru.find(bin); - g = AnalyzerGuru.getGenre(a); - } - if (g == AbstractAnalyzer.Genre.IMAGE) { + AnalyzerFactory a = AnalyzerGuru.getAnalyzerFactory( + resourceFile, basename, true); + genre = AnalyzerGuru.getGenre(a); + if (genre == Genre.IMAGE) { %>

Image from Source Repository
<% - } else if ( g == AbstractAnalyzer.Genre.HTML) { + } else if (genre == Genre.HTML) { /** * For backward compatibility, read the OpenGrok-produced * document using the system default charset. @@ -254,7 +258,7 @@ document.pageReady.push(function() { pageReadyList();}); r = new InputStreamReader(bin); // dumpXref() is also useful here for translating links. Util.dumpXref(out, r, request.getContextPath()); - } else if (g == AbstractAnalyzer.Genre.PLAIN) { + } else if (genre == Genre.PLAIN) { %>
<%
@@ -297,23 +301,19 @@ Click download <%= basename %><%
         } else {
             // requesting a previous revision or needed to generate xref on the fly (economy mode).
 
-            AnalyzerFactory a = AnalyzerGuru.find(basename);
-            /*
-             * N.b. searchHelper.destroy() is called via
-             * WebappListener.requestDestroyed() on presence of an attribute,
-             * REQUEST_ATTR, set by the following.
-             */
-            SearchHelper searchHelper = cfg.prepareInternalSearch();
-            prepareExec(searchHelper, project);
-            Genre g = null;
-            if (searchHelper.searcher != null) {
-                g = searchHelper.searchSingleGenre(resourceFile);
+            AnalyzerFactory a = rev.equals(DUMMY_REVISION) ? AnalyzerGuru.getAnalyzerFactory(
+                    resourceFile, path, true) : AnalyzerGuru.find(basename);
+            if (genre == null) {
+                prepareExec(cfg);
+                if (searchHelper.searcher != null) {
+                    genre = searchHelper.searchSingleGenre(resourceFile);
+                }
             }
-            if (g == null) {
-                g = AnalyzerGuru.getGenre(a);
+            if (genre == null) {
+                genre = AnalyzerGuru.getGenre(a);
             }
             String error = null;
-            if (g == Genre.PLAIN || g == Genre.HTML || g == null) {
+            if (genre == Genre.PLAIN || genre == Genre.HTML || genre == null) {
                 InputStream in = null;
                 File tempf = null;
                 try {
@@ -339,11 +339,11 @@ Click download <%= basename %><%
                 }
                 if (in != null) {
                     try {
-                        if (g == null) {
-                            a = AnalyzerGuru.find(in, basename);
-                            g = AnalyzerGuru.getGenre(a);
+                        if (genre == null) {
+                            a = AnalyzerGuru.getAnalyzerFactory(in, basename);
+                            genre = AnalyzerGuru.getGenre(a);
                         }
-                        if (g == AbstractAnalyzer.Genre.DATA || g == AbstractAnalyzer.Genre.XREFABLE || g == null) {
+                        if (genre == Genre.DATA || genre == Genre.XREFABLE || genre == null) {
     %>
     
Download file, ">download <%= basename %><% %>
<%
-                            if (g == AbstractAnalyzer.Genre.PLAIN) {
+                            if (genre == Genre.PLAIN) {
                                 Definitions defs = null;
                                 ObjectPool ctagsPool = cfg.getEnv().
                                         getIndexerParallelizer().getCtagsPool();
@@ -392,11 +392,11 @@ Click download <%= basename %><%
                                         request.getContextPath(),
                                         a, r, out,
                                         defs, annotation, project);
-                            } else if (g == AbstractAnalyzer.Genre.IMAGE) {
+                            } else if (genre == Genre.IMAGE) {
         %>
<%
-                            } else if (g == AbstractAnalyzer.Genre.HTML) {
+                            } else if (genre == Genre.HTML) {
                                 /**
                                  * For backward compatibility, read the
                                  * OpenGrok-produced document using the system
@@ -438,7 +438,7 @@ Click download <%= basename %><%
     

<%= error %>

<% } } - } else if (g == AbstractAnalyzer.Genre.IMAGE) { + } else if (genre == Genre.IMAGE) { %>
download <%= basename %><% %>
<% } else { + AnalyzerFactory fac = AnalyzerGuru.getAnalyzerFactory(resourceFile, path, true); + if (Genre.DATA.equals(fac.getGenre())) { + %> +
+ Download file, <%= basename %> +
<% + } else { %>

Failed to get xref file

<% + } } } } @@ -477,12 +485,31 @@ include file="foot.jspf" %> <%! - private static void prepareExec(SearchHelper searchHelper, Project project) { - if (project != null) { - searchHelper.prepareExec(project); - } else { - //noinspection Convert2Diamond - searchHelper.prepareExec(new TreeSet()); + private Project project; + private SearchHelper searchHelper; + private Genre genre; + + private void resetData() { + project = null; + searchHelper = null; + genre = null; + } + + private void prepareExec(PageConfig cfg) { + if (searchHelper == null) { + /* + * N.b. searchHelper.destroy() is called via + * WebappListener.requestDestroyed() on presence of an attribute, + * REQUEST_ATTR, set by the following. + */ + searchHelper = cfg.prepareInternalSearch(); + + if (project != null) { + searchHelper.prepareExec(project); + } else { + //noinspection Convert2Diamond + searchHelper.prepareExec(new TreeSet()); + } } } %> diff --git a/opengrok-web/src/main/webapp/mast.jsp b/opengrok-web/src/main/webapp/mast.jsp index 8d3463f9031..207ed25c991 100644 --- a/opengrok-web/src/main/webapp/mast.jsp +++ b/opengrok-web/src/main/webapp/mast.jsp @@ -1,6 +1,4 @@ <%-- -$Id$ - CDDL HEADER START The contents of this file are subject to the terms of the @@ -20,12 +18,9 @@ CDDL HEADER END Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. Portions Copyright 2011 Jens Elkner. -Portions Copyright (c) 2018, Chris Fraire . - ---%><%-- +Portions Copyright (c) 2018, 2020, Chris Fraire . 
After include you are here: /body/div#page/div#content/ - --%> <%@page contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%> <%@page import="org.opengrok.indexer.web.messages.MessagesContainer"%> @@ -71,7 +66,7 @@ include file="httpheader.jspf" %> diff --git a/opengrok-web/src/main/webapp/minisearch.jspf b/opengrok-web/src/main/webapp/minisearch.jspf index 044a0887ccb..85758708b82 100644 --- a/opengrok-web/src/main/webapp/minisearch.jspf +++ b/opengrok-web/src/main/webapp/minisearch.jspf @@ -45,7 +45,7 @@ org.opengrok.indexer.web.Util"%><% } if (!cfg.hasAnnotations() /* || cfg.getPrefix() == Prefix.HIST_S */ ) { %>
  • Annotate
  • <% - } else if (cfg.annotate()) { + } else if (cfg.shouldAnnotate()) { %>
  • <% @@ -301,7 +301,7 @@ Click download <%= basename %><% } else { // requesting a previous revision or needed to generate xref on the fly (economy mode). - AnalyzerFactory a = rev.equals(DUMMY_REVISION) ? AnalyzerGuru.getAnalyzerFactory( + AnalyzerFactory fac = rev.equals(DUMMY_REVISION) ? AnalyzerGuru.getAnalyzerFactory( resourceFile, path, true) : AnalyzerGuru.find(basename); if (genre == null) { prepareExec(cfg); @@ -310,7 +310,7 @@ Click download <%= basename %><% } } if (genre == null) { - genre = AnalyzerGuru.getGenre(a); + genre = AnalyzerGuru.getGenre(fac); } String error = null; if (genre == Genre.PLAIN || genre == Genre.HTML || genre == null) { @@ -339,9 +339,9 @@ Click download <%= basename %><% } if (in != null) { try { - if (genre == null) { - a = AnalyzerGuru.getAnalyzerFactory(in, basename); - genre = AnalyzerGuru.getGenre(a); + if (fac == null || genre == null) { + fac = AnalyzerGuru.getAnalyzerFactory(in, basename); + genre = AnalyzerGuru.getGenre(fac); } if (genre == Genre.DATA || genre == Genre.XREFABLE || genre == null) { %> @@ -385,12 +385,10 @@ Click download <%= basename %><% Annotation annotation = cfg.getAnnotation(); //not needed yet //annotation.writeTooltipMap(out); - // SRCROOT is read with UTF-8 as a default. + // sourceRoot is read with UTF-8 as a default. 
r = IOUtils.createBOMStrippedReader(in, - StandardCharsets.UTF_8.name()); - AnalyzerGuru.writeDumpedXref( - request.getContextPath(), - a, r, out, + StandardCharsets.UTF_8.name()); + AnalyzerGuru.writeDumpedXref(request.getContextPath(), fac, r, out, defs, annotation, project); } else if (genre == Genre.IMAGE) { %> From 9c87d35bb4b33835f35860a8437e18d5410f8c4d Mon Sep 17 00:00:00 2001 From: Chris Fraire Date: Fri, 22 May 2020 22:12:50 -0500 Subject: [PATCH 14/15] Fix not properly reanalyzing for Genre.HTML --- opengrok-web/src/main/webapp/list.jsp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/opengrok-web/src/main/webapp/list.jsp b/opengrok-web/src/main/webapp/list.jsp index 85b03871c87..f194a3f00a8 100644 --- a/opengrok-web/src/main/webapp/list.jsp +++ b/opengrok-web/src/main/webapp/list.jsp @@ -395,17 +395,11 @@ Click download <%= basename %><%
    <%
                                 } else if (genre == Genre.HTML) {
    -                                /**
    -                                 * For backward compatibility, read the
    -                                 * OpenGrok-produced document using the system
    -                                 * default charset.
    -                                 */
    -                                r = new InputStreamReader(in);
    -                                /**
    -                                 * dumpXref() is also useful here for
    -                                 * translating links.
    -                                 */
    -                                Util.dumpXref(out, r, request.getContextPath());
    +                                // sourceRoot is read with UTF-8 as a default.
    +                                r = IOUtils.createBOMStrippedReader(in,
    +                                        StandardCharsets.UTF_8.name());
    +                                AnalyzerGuru.writeDumpedXref(request.getContextPath(), fac, r, out,
    +                                        null, null, project);
                                 } else {
             %>Download file, <%= basename %><%
    
    From 36245a5d19903797931867fd29da71230aaed8fe Mon Sep 17 00:00:00 2001
    From: Chris Fraire 
    Date: Wed, 7 Oct 2020 13:51:52 -0500
    Subject: [PATCH 15/15] Fix String mangled during automatic-merge
    
    ---
     .../src/main/java/org/opengrok/indexer/index/Indexer.java       | 2 +-
     1 file changed, 1 insertion(+), 1 deletion(-)
    
    diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java
    index f7e9db95558..e3cc44f4c10 100644
    --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java
    +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/index/Indexer.java
    @@ -580,7 +580,7 @@ public static String[] parseOptions(String[] argv) throws ParseException {
     
                 parser.on("--hugeCharacters", "=number", Integer.class,
                         "Limit for number of characters to read and index from a Huge Text data",
    -                    "Assumes --renamedHistory=on").execute(value ->
    +                    "file. Default is 5_000_000.").execute(value ->
                         cfg.setHugeTextLimitCharacters((int) value));
     
                 parser.on("-I", "--include", "=pattern",