Skip to content

Extracting base classes for analyzer factory and analyzers #2658

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 4, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* See LICENSE.txt included in this distribution for the specific
* language governing permissions and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at LICENSE.txt.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright (c) 2019 Oracle and/or its affiliates. All rights reserved.
*/
package org.opengrok.indexer.analysis;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.opengrok.indexer.configuration.Project;

/**
 * Base class for analyzers. It keeps a reference to the factory that created
 * the analyzer, the optional {@link Project} and {@link Ctags} collaborators,
 * and the scope/folding switches, and it declares the operations every
 * concrete analyzer has to provide.
 */
public abstract class AbstractAnalyzer extends Analyzer {

    /**
     * A reader over the empty string.
     */
    public static final Reader DUMMY_READER = new StringReader("");

    /**
     * The factory which created this analyzer; exposed via
     * {@link #getFactory()} and consulted by {@link #getGenre()}.
     */
    protected AnalyzerFactory factory;

    /**
     * Your analyzer HAS to override this to get proper symbols in results.
     */
    protected JFlexTokenizer symbolTokenizer;

    /**
     * Value last passed to {@link #setProject(Project)}.
     */
    protected Project project;

    /**
     * Value last passed to {@link #setCtags(Ctags)}.
     */
    protected Ctags ctags;

    /**
     * {@code true} only if scopes were requested and
     * {@link #supportsScopes()} answered {@code true} at that time.
     */
    protected boolean scopesEnabled;

    /**
     * {@code true} only if folding was requested and
     * {@link #supportsScopes()} answered {@code true} at that time.
     */
    protected boolean foldingEnabled;

    /**
     * Creates the analyzer, handing the reuse strategy to the superclass.
     *
     * @param reuseStrategy strategy forwarded to the {@link Analyzer}
     * constructor
     */
    public AbstractAnalyzer(ReuseStrategy reuseStrategy) {
        super(reuseStrategy);
    }

    /**
     * Gets the version number of this analyzer.
     *
     * @return the version number as defined by the concrete analyzer
     */
    public abstract long getVersionNo();

    /**
     * Gets a number that subclasses are meant to override so that it tracks
     * the evolution of their analysis from release to release. The base
     * implementation is not specialized and therefore always answers zero.
     *
     * @return 0
     */
    protected int getSpecializedVersionNo() {
        return 0;
    }

    /**
     * Stores the ctags instance for this analyzer.
     *
     * @param ctags the instance to remember
     */
    public void setCtags(Ctags ctags) {
        this.ctags = ctags;
    }

    /**
     * Stores the project for this analyzer.
     *
     * @param project the project to remember
     */
    public void setProject(Project project) {
        this.project = project;
    }

    /**
     * Turns scopes on or off. The request is honoured only when
     * {@link #supportsScopes()} returns {@code true}; otherwise scopes stay
     * disabled.
     *
     * @param scopesEnabled the requested state
     */
    public void setScopesEnabled(boolean scopesEnabled) {
        // Query the capability first, unconditionally, then combine.
        final boolean supported = supportsScopes();
        this.scopesEnabled = supported && scopesEnabled;
    }

    /**
     * Turns folding on or off. The request is honoured only when
     * {@link #supportsScopes()} returns {@code true}; otherwise folding stays
     * disabled.
     *
     * @param foldingEnabled the requested state
     */
    public void setFoldingEnabled(boolean foldingEnabled) {
        // Folding is gated by the same capability check as scopes.
        final boolean supported = supportsScopes();
        this.foldingEnabled = supported && foldingEnabled;
    }

    /**
     * Tells whether this analyzer supports scopes; the answer gates both
     * {@link #setScopesEnabled(boolean)} and
     * {@link #setFoldingEnabled(boolean)}.
     *
     * @return {@code true} if scopes are supported
     */
    protected abstract boolean supportsScopes();

    /**
     * Gets the factory which created this analyzer.
     *
     * @return the {@code AnalyzerFactory} held in {@link #factory}
     */
    public final AnalyzerFactory getFactory() {
        return factory;
    }

    /**
     * Gets the genre of this analyzer, as reported by its factory.
     *
     * @return the genre returned by the factory's {@code getGenre()}
     */
    public AbstractAnalyzer.Genre getGenre() {
        return getFactory().getGenre();
    }

    /**
     * Gets the name of the file type handled by this analyzer.
     *
     * @return the file type name as defined by the concrete analyzer
     */
    public abstract String getFileTypeName();

    /**
     * Analyzes the given source.
     *
     * @param doc the document associated with the analysis
     * @param src the source to analyze
     * @param xrefOut the writer for cross-reference output
     * @throws IOException if an I/O error occurs
     * @throws InterruptedException if the analysis is interrupted
     */
    public abstract void analyze(Document doc, StreamSource src, Writer xrefOut)
            throws IOException, InterruptedException;

    /**
     * Writes a cross-reference using the supplied arguments.
     *
     * @param args the arguments describing the xref to write
     * @return the {@code Xrefer} produced by the concrete analyzer
     * @throws IOException if an I/O error occurs
     */
    public abstract Xrefer writeXref(WriteXrefArgs args) throws IOException;

    /**
     * Re-declared as abstract so that every concrete analyzer has to supply
     * its own components.
     *
     * @param fieldName the name of the field to create components for
     * @return the components for the field
     */
    @Override
    protected abstract TokenStreamComponents createComponents(String fieldName);

    /**
     * Records a number-of-lines value on the document.
     *
     * @param doc the target document
     * @param value the value to record
     */
    protected abstract void addNumLines(Document doc, int value);

    /**
     * Records a lines-of-code value on the document.
     *
     * @param doc the target document
     * @param value the value to record
     */
    protected abstract void addLOC(Document doc, int value);

    /**
     * Re-declared as abstract so that every concrete analyzer has to supply
     * its own normalization.
     *
     * @param fieldName the name of the field being normalized
     * @param in the incoming token stream
     * @return the normalized token stream
     */
    @Override
    protected abstract TokenStream normalize(String fieldName, TokenStream in);

    /**
     * What kind of file is this?
     */
    public enum Genre {
        /**
         * xrefed - line numbered context.
         */
        PLAIN("p"),
        /**
         * xrefed - summarizer context.
         */
        XREFABLE("x"),
        /**
         * not xrefed - no context - used by diff/list.
         */
        IMAGE("i"),
        /**
         * not xrefed - no context.
         */
        DATA("d"),
        /**
         * not xrefed - summarizer context from original file.
         */
        HTML("h");

        /**
         * Short tag associated with the genre.
         */
        private final String typeName;

        Genre(String typename) {
            this.typeName = typename;
        }

        /**
         * Gets the type name value used to tag lucene documents.
         *
         * @return a non-null string
         */
        public String typeName() {
            return typeName;
        }

        /**
         * Looks up the genre whose type name equals the given value.
         *
         * @param typeName name to check
         * @return the matching genre, or {@code null} if the name is
         * {@code null} or does not match any genre
         * @see #typeName()
         */
        public static Genre get(String typeName) {
            Genre match = null;
            if (typeName != null) {
                for (Genre candidate : values()) {
                    if (typeName.equals(candidate.typeName)) {
                        match = candidate;
                        break;
                    }
                }
            }
            return match;
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* See LICENSE.txt included in this distribution for the specific
* language governing permissions and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at LICENSE.txt.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright (c) 2019 Oracle and/or its affiliates. All rights reserved.
*/
package org.opengrok.indexer.analysis;

import java.util.Collections;
import java.util.List;

/**
 * Base class for analyzer factories. It stores the criteria (file names,
 * prefixes, suffixes, magic strings and matchers) describing which files a
 * kind of analyzer applies to, together with the content type and genre, and
 * it declares the operations for obtaining and returning analyzers.
 */
public abstract class AnalyzerFactory {

    /**
     * Cached analyzer object for the current thread (analyzer objects can be
     * expensive to allocate).
     */
    protected final ThreadLocal<AbstractAnalyzer> cachedAnalyzer;

    /**
     * File names on which this kind of analyzer should be used. Not assigned
     * by this class.
     */
    protected List<String> names;

    /**
     * File prefixes on which this kind of analyzer should be used. Not
     * assigned by this class.
     */
    protected List<String> prefixes;

    /**
     * File extensions on which this kind of analyzer should be used. Not
     * assigned by this class.
     */
    protected List<String> suffixes;

    /**
     * Magic strings used to recognize files on which this kind of analyzer
     * should be used. Not assigned by this class.
     */
    protected List<String> magics;

    /**
     * Matchers which delegate files to different types of analyzers; either
     * empty or holding the single matcher given to the constructor.
     */
    protected final List<FileAnalyzerFactory.Matcher> matchers;

    /**
     * The content type for the files recognized by this kind of analyzer.
     */
    protected final String contentType;

    /**
     * The genre for files recognized by this kind of analyzer. Not assigned
     * by this class.
     */
    protected AbstractAnalyzer.Genre genre;

    /**
     * Creates a factory with at most one matcher and the given content type.
     *
     * @param matcher the matcher to register, or {@code null} for none
     * @param contentType the content type reported by
     * {@link #getContentType()}
     */
    public AnalyzerFactory(FileAnalyzerFactory.Matcher matcher, String contentType) {
        this.contentType = contentType;
        this.matchers = (matcher != null)
                ? Collections.<FileAnalyzerFactory.Matcher>singletonList(matcher)
                : Collections.<FileAnalyzerFactory.Matcher>emptyList();
        this.cachedAnalyzer = new ThreadLocal<>();
    }

    /**
     * Gets the file names recognized by this analyzer (names must match
     * exactly, ignoring case).
     *
     * @return the value of {@link #names}
     */
    final List<String> getFileNames() {
        return this.names;
    }

    /**
     * Gets the file prefixes recognized by this analyzer.
     *
     * @return the value of {@link #prefixes}
     */
    final List<String> getPrefixes() {
        return this.prefixes;
    }

    /**
     * Gets the file extensions recognized by this analyzer.
     *
     * @return the value of {@link #suffixes}
     */
    final List<String> getSuffixes() {
        return this.suffixes;
    }

    /**
     * Gets the magic strings recognized by this analyzer. If a file starts
     * with one of these strings, an analyzer created by this factory should
     * be used to analyze it.
     *
     * <p><b>Note:</b> Currently this assumes that the file is encoded with
     * UTF-8 unless a BOM is detected.
     *
     * @return the value of {@link #magics}
     */
    final List<String> getMagicStrings() {
        return this.magics;
    }

    /**
     * Gets the matchers that map file contents to analyzer factories
     * programmatically.
     *
     * @return the list of matchers
     */
    final List<FileAnalyzerFactory.Matcher> getMatchers() {
        return this.matchers;
    }

    /**
     * Gets the content type (MIME type) for analyzers returned by this
     * factory.
     *
     * @return the content type (could be {@code null} if it is unknown)
     */
    final String getContentType() {
        return this.contentType;
    }

    /**
     * Gets the genre this analyzer factory belongs to.
     *
     * @return the value of {@link #genre}
     */
    public final AbstractAnalyzer.Genre getGenre() {
        return this.genre;
    }

    /**
     * Gets the user friendly name of this analyzer.
     *
     * @return the name
     */
    public abstract String getName();

    /**
     * Gets an analyzer from this factory.
     *
     * @return an analyzer, as provided by the concrete factory
     */
    public abstract AbstractAnalyzer getAnalyzer();

    /**
     * Counterpart of {@link #getAnalyzer()}; the exact behaviour is defined
     * by the concrete factory.
     */
    public abstract void returnAnalyzer();

    /**
     * Creates a new analyzer instance.
     *
     * @return the newly created analyzer
     */
    protected abstract AbstractAnalyzer newAnalyzer();
}
Loading