diff --git a/commons/src/main/java/org/archive/bdb/BdbModule.java b/commons/src/main/java/org/archive/bdb/BdbModule.java index ada7ba0e4..ea9ad59ba 100644 --- a/commons/src/main/java/org/archive/bdb/BdbModule.java +++ b/commons/src/main/java/org/archive/bdb/BdbModule.java @@ -53,6 +53,8 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.Lifecycle; +// Read DB: https://github.com/opensourceBIM/BIMserver/issues/531#issuecomment-303364364 + import com.sleepycat.bind.EntryBinding; import com.sleepycat.bind.serial.SerialBinding; import com.sleepycat.bind.serial.StoredClassCatalog; diff --git a/engine/src/main/java/org/archive/crawler/frontier/PUCCostAssignmentPolicy.java b/engine/src/main/java/org/archive/crawler/frontier/PUCCostAssignmentPolicy.java index acc01c6c9..d0f116e1a 100644 --- a/engine/src/main/java/org/archive/crawler/frontier/PUCCostAssignmentPolicy.java +++ b/engine/src/main/java/org/archive/crawler/frontier/PUCCostAssignmentPolicy.java @@ -20,6 +20,10 @@ import org.archive.crawler.util.PUC; +import org.archive.crawler.reporting.CrawlerLoggerModule; + +import org.springframework.beans.factory.annotation.Autowired; + import org.archive.net.UURI; import org.archive.modules.CrawlURI; @@ -47,22 +51,32 @@ public class PUCCostAssignmentPolicy extends CostAssignmentPolicy implements Has private static final long serialVersionUID = 1L; - //private static final ConfigPath logFile = new ConfigPath(PUCCostAssignmentPolicy.class.getName(),"${launchId}/logs/cost_puc.log"); - private static final Logger logger = Logger.getLogger(PUCCostAssignmentPolicy.class.getName()); - protected KeyedProperties kp = new KeyedProperties(); public KeyedProperties getKeyedProperties() { return kp; } + protected CrawlerLoggerModule loggerModule; + public CrawlerLoggerModule getLoggerModule() { + return this.loggerModule; + } + @Autowired + public void setLoggerModule(CrawlerLoggerModule loggerModule) { + this.loggerModule = loggerModule; 
+ } + + private Logger getLogger() { + return loggerModule.getUriCost(); + } + /* private static void setupLogFile() throws IOException, SecurityException { - logger.setLevel(Level.INFO); + getLogger().setLevel(Level.INFO); GenerationFileHandler fh = GenerationFileHandler.makeNew(logFile.getFile().getAbsolutePath(), false, false); - logger.addHandler(fh); - logger.setUseParentHandlers(false); + getLogger().addHandler(fh); + getLogger().setUseParentHandlers(false); } */ @@ -83,8 +97,8 @@ private static void setupLogFile() throws IOException, SecurityException { try { setupLogFile(); } catch (Exception e) { - if (logger.isLoggable(Level.WARNING)) { - logger.warning("couldn't setup the log file: " + e.toString()); + if (getLogger().isLoggable(Level.WARNING)) { + getLogger().warning("couldn't setup the log file: " + e.toString()); } } */ @@ -175,13 +189,6 @@ public Boolean getLoggerFine() { } public void setLoggerFine(Boolean logger_fine) { - if (logger_fine) { - logger.setLevel(Level.FINE); - } - else { - logger.setLevel(Level.INFO); - } - kp.put("loggerFine", logger_fine); } @@ -193,29 +200,29 @@ public double requestMetric(String src_url, String trg_url, String src_lang, Str request_param = String.format("src_urls=%s&trg_urls=%s", src_url, trg_url); if (getUseLanguages()) { - logger.log(Level.WARNING, String.format("Src or trg is empty but languages were expected: %s %s %s %s", src_lang, trg_lang, src_url, trg_url)); + getLogger().log(Level.WARNING, String.format("Src or trg is empty but languages were expected: %s %s %s %s", src_lang, trg_lang, src_url, trg_url)); } } else { request_param = String.format("src_urls=%s&trg_urls=%s&src_urls_lang=%s&trg_urls_lang=%s", src_url, trg_url, src_lang, trg_lang); } - if (logger.isLoggable(Level.FINER)) { - logger.finer("request param: " + request_param); + if (getLogger().isLoggable(Level.FINER)) { + getLogger().finer("request param: " + request_param); } try { String request_result = PUC.sendPOST(getMetricServerUrl(), 
request_param, getUserAgent()); if (request_result == null) { - logger.log(Level.WARNING, "Request result was null"); + getLogger().log(Level.WARNING, "Request result was null"); // Set similarity to minimum value result = 0.0; } else { - if (logger.isLoggable(Level.FINER)) { - logger.finer("request result: " + request_result); + if (getLogger().isLoggable(Level.FINER)) { + getLogger().finer("request result: " + request_result); } JSONObject obj = new JSONObject(request_result); @@ -224,7 +231,7 @@ public double requestMetric(String src_url, String trg_url, String src_lang, Str JSONArray scores = obj.getJSONArray("ok"); if (scores.length() != 1) { - logger.log(Level.WARNING, String.format("Unexpected length of scores: %d", scores.length())); + getLogger().log(Level.WARNING, String.format("Unexpected length of scores: %d", scores.length())); // Set similarity to minimum value result = 0.0; @@ -237,14 +244,14 @@ public double requestMetric(String src_url, String trg_url, String src_lang, Str try { String error = obj.getString("err"); - logger.log(Level.WARNING, String.format("PUC error: %s", error), e); + getLogger().log(Level.WARNING, String.format("PUC error: %s", error), e); } catch (JSONException e2) { - logger.log(Level.WARNING, "JSON exception", e2); + getLogger().log(Level.WARNING, "JSON exception", e2); } } } } catch (Exception e) { - logger.log(Level.WARNING, "Request exception", e); + getLogger().log(Level.WARNING, "Request exception", e); // Set similarity to minimum value result = 0.0; @@ -257,6 +264,13 @@ public double requestMetric(String src_url, String trg_url, String src_lang, Str } public int costOf(CrawlURI curi) { + if (getLoggerFine()) { + getLogger().setLevel(Level.FINE); + } + else { + getLogger().setLevel(Level.INFO); + } + UURI uri = curi.getUURI(); UURI via = curi.getVia(); String str_uri = PUC.removeTrailingSlashes(uri.toCustomString()); @@ -268,8 +282,8 @@ public int costOf(CrawlURI curi) { return 1; } if (via == null) { - if 
(logger.isLoggable(Level.FINE)) { - logger.fine(String.format("via is null. uri: (cost: %d) %s", cost, str_uri)); + if (getLogger().isLoggable(Level.FINE)) { + getLogger().fine(String.format("via is null. uri: (cost: %d) %s", cost, str_uri)); } return 1; @@ -289,7 +303,7 @@ public int costOf(CrawlURI curi) { } if ((!str_uri.startsWith("http://") && !str_uri.startsWith("https://")) || (!str_via.startsWith("http://") && !str_via.startsWith("https://"))) { - logger.log(Level.WARNING, String.format("Unexpected URI scheme: %s -> %s", str_via, str_uri)); + getLogger().log(Level.WARNING, String.format("Unexpected URI scheme: %s -> %s", str_via, str_uri)); return 150; } @@ -299,8 +313,8 @@ public int costOf(CrawlURI curi) { // We want via and current URI documents to be HTML, but current URI head hasn't been downloaded yet... if (!via_curi.getContentType().startsWith("text/html")) { // The content is not HTML - if (logger.isLoggable(Level.FINE)) { - logger.fine(String.format("Content-Type is not HTML: via uri (via content-type): %s (%s)", str_via, via_curi.getContentType())); + if (getLogger().isLoggable(Level.FINE)) { + getLogger().fine(String.format("Content-Type is not HTML: via uri (via content-type): %s (%s)", str_via, via_curi.getContentType())); } return 100; @@ -309,8 +323,8 @@ public int costOf(CrawlURI curi) { String lang_ok = ""; String detected_lang = ""; - String uri_domain = PUC.getDomain(str_uri, logger); - String via_domain = PUC.getDomain(str_via, logger); + String uri_domain = PUC.getDomain(str_uri, getLogger()); + String via_domain = PUC.getDomain(str_via, getLogger()); // Language if (getUseLanguages()) { @@ -318,7 +332,7 @@ public int costOf(CrawlURI curi) { String lang2 = getLangPreference2(); String[] lang_result; - lang_result = PUC.isLangOk(curi, lang1, lang2, getOnlyReliableDetection(), logger); + lang_result = PUC.isLangOk(curi, lang1, lang2, getOnlyReliableDetection(), getLogger()); lang_ok = lang_result[0]; detected_lang = lang_result[1]; @@ 
-333,7 +347,7 @@ else if (lang_ok.equals(lang2)) { trg_urls_lang = lang1; } else { - logger.log(Level.WARNING, String.format("Unexpected languages mismatch: lang1 | lang2 | lang_ok | detected_lang: %s | %s | %s | %s", lang1, lang2, lang_ok, detected_lang)); + getLogger().log(Level.WARNING, String.format("Unexpected languages mismatch: lang1 | lang2 | lang_ok | detected_lang: %s | %s | %s | %s", lang1, lang2, lang_ok, detected_lang)); } } } @@ -357,8 +371,8 @@ else if (getSameDomain() && !uri_domain.equals(via_domain)) { cost = 100 - (int)(similarity + 0.5) + exploration_value; // [exploration_value, 100 + exploration_value] - if (logger.isLoggable(Level.FINE)) { - logger.fine(String.format("cost | similarity | via (detected lang) -> uri | src_lang - trg_lang: %d | %f | %s (%s) -> %s | %s - %s", cost, similarity, str_via, detected_lang, str_uri, src_urls_lang, trg_urls_lang)); + if (getLogger().isLoggable(Level.FINE)) { + getLogger().fine(String.format("cost | similarity | via (detected lang) -> uri | src_lang - trg_lang: %d | %f | %s (%s) -> %s | %s - %s", cost, similarity, str_via, detected_lang, str_uri, src_urls_lang, trg_urls_lang)); } } diff --git a/engine/src/main/java/org/archive/crawler/frontier/SurtAuthorityQueueAssignmentPolicy.java b/engine/src/main/java/org/archive/crawler/frontier/SurtAuthorityQueueAssignmentPolicy.java index 07178214a..2f4d0b3a4 100644 --- a/engine/src/main/java/org/archive/crawler/frontier/SurtAuthorityQueueAssignmentPolicy.java +++ b/engine/src/main/java/org/archive/crawler/frontier/SurtAuthorityQueueAssignmentPolicy.java @@ -35,6 +35,8 @@ protected String getCoreKey(UURI basis) { } protected String getSurtAuthority(String surt) { + // A URI scheme://domain.tld/path?query has a SURT form of scheme://(tld,domain,)/path?query. 
(https://heritrix.readthedocs.io/en/stable/glossary.html) + // scheme://(tld,domain,)/path?query -> return "tld,domain," int indexOfOpen = surt.indexOf("://("); int indexOfClose = surt.indexOf(")"); if (indexOfOpen == -1 || indexOfClose == -1 diff --git a/engine/src/main/java/org/archive/crawler/io/UriCostLogFormatter.java b/engine/src/main/java/org/archive/crawler/io/UriCostLogFormatter.java new file mode 100644 index 000000000..f2ba8ff68 --- /dev/null +++ b/engine/src/main/java/org/archive/crawler/io/UriCostLogFormatter.java @@ -0,0 +1,36 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.crawler.io; + +import java.util.logging.Formatter; +import java.util.logging.LogRecord; + + +/** + * @author Cristian García-Romero + */ +public class UriCostLogFormatter extends Formatter { + public UriCostLogFormatter() { + super(); + } + + public String format(LogRecord record) { + return record.getMessage() + "\n"; + } +} diff --git a/engine/src/main/java/org/archive/crawler/reporting/CrawlerLoggerModule.java b/engine/src/main/java/org/archive/crawler/reporting/CrawlerLoggerModule.java index 95c00e4aa..b006a5b9d 100644 --- a/engine/src/main/java/org/archive/crawler/reporting/CrawlerLoggerModule.java +++ b/engine/src/main/java/org/archive/crawler/reporting/CrawlerLoggerModule.java @@ -37,6 +37,7 @@ import org.archive.crawler.io.NonFatalErrorFormatter; import org.archive.crawler.io.RuntimeErrorFormatter; import org.archive.crawler.io.StatisticsLogFormatter; +import org.archive.crawler.io.UriCostLogFormatter; import org.archive.crawler.io.UriErrorFormatter; import org.archive.crawler.io.UriProcessingFormatter; import org.archive.crawler.util.Logs; @@ -98,6 +99,7 @@ public void setLogExtraInfo(boolean logExtraInfo) { private static final String LOGNAME_ALERTS = "alerts"; private static final String LOGNAME_PROGRESS_STATISTICS = "progress-statistics"; + private static final String LOGNAME_URI_COST = "uri-cost"; private static final String LOGNAME_URI_ERRORS = "uri-errors"; private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors"; private static final String LOGNAME_NONFATAL_ERRORS = "nonfatal-errors"; @@ -129,6 +131,15 @@ public ConfigPath getProgressLogPath() { public void setProgressLogPath(ConfigPath cp) { this.progressLogPath.merge(cp); } + + protected ConfigPath uriCostLogPath = + new ConfigPath(Logs.URI_COST.getFilename(),Logs.URI_COST.getFilename()); + public ConfigPath getUriCostLogPath() { + return uriCostLogPath; + } + public void setUriCostLogPath(ConfigPath cp) { + this.uriCostLogPath.merge(cp); + } protected 
ConfigPath uriErrorsLogPath = new ConfigPath(Logs.URI_ERRORS.getFilename(),Logs.URI_ERRORS.getFilename()); @@ -194,6 +205,11 @@ public void setNonfatalErrorsLogPath(ConfigPath cp) { */ private transient Logger progressStats; + /** + * The URI cost assignment policy logs per-URI cost details here. + */ + private transient Logger uriCost; + /** * Record of fileHandlers established for loggers, * assisting file rotation. @@ -244,6 +260,7 @@ protected void setupLogs() throws IOException { uriErrors = Logger.getLogger(LOGNAME_URI_ERRORS + "." + logsPath); progressStats = Logger.getLogger(LOGNAME_PROGRESS_STATISTICS + "." + logsPath); + uriCost = Logger.getLogger(LOGNAME_URI_COST + "." + logsPath); this.fileHandlers = new HashMap(); setupLogFile(uriProcessing, @@ -266,6 +283,10 @@ protected void setupLogs() throws IOException { getProgressLogPath().getFile().getAbsolutePath(), new StatisticsLogFormatter(), true); + setupLogFile(uriCost, + getUriCostLogPath().getFile().getAbsolutePath(), + new UriCostLogFormatter(), true); + setupAlertLog(logsPath); } @@ -418,6 +439,10 @@ public Logger getUriErrors() { public Logger getUriProcessing() { return uriProcessing; } + + public Logger getUriCost() { + return uriCost; + } public int getAlertCount() { if (atg != null) { @@ -459,7 +484,7 @@ private void readObject(ObjectInputStream in) public void afterPropertiesSet() throws Exception { ConfigPath[] paths = { - crawlLogPath, alertsLogPath, progressLogPath, + crawlLogPath, alertsLogPath, progressLogPath, uriCostLogPath, uriErrorsLogPath, runtimeErrorsLogPath, nonfatalErrorsLogPath }; for(ConfigPath cp : paths) { if(cp.getBase()==null) { diff --git a/engine/src/main/java/org/archive/crawler/util/Logs.java b/engine/src/main/java/org/archive/crawler/util/Logs.java index 3aa08b3e1..8ac97718d 100644 --- a/engine/src/main/java/org/archive/crawler/util/Logs.java +++ b/engine/src/main/java/org/archive/crawler/util/Logs.java @@ -1,21 +1,21 @@ -/* - * This file is part of the Heritrix web crawler 
(crawler.archive.org). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package org.archive.crawler.util; @@ -29,6 +29,7 @@ public enum Logs{ CRAWL ("crawl.log"), ALERTS ("alerts.log"), PROGRESS_STATISTICS ("progress-statistics.log"), + URI_COST ("uri-cost.log"), RUNTIME_ERRORS ("runtime-errors.log"), NONFATAL_ERRORS ("nonfatal-errors.log"), URI_ERRORS ("uri-errors.log"); diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profiles/profile-crawler-beans.cxml b/engine/src/main/resources/org/archive/crawler/restlet/profiles/profile-crawler-beans.cxml index 5ed606b31..e84abefc3 100644 --- a/engine/src/main/resources/org/archive/crawler/restlet/profiles/profile-crawler-beans.cxml +++ b/engine/src/main/resources/org/archive/crawler/restlet/profiles/profile-crawler-beans.cxml @@ -683,6 +683,7 @@ http://example.example/example + diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profiles/profile-only-text.cxml b/engine/src/main/resources/org/archive/crawler/restlet/profiles/profile-only-text.cxml index 53efe02f0..f9b8e9752 100644 --- a/engine/src/main/resources/org/archive/crawler/restlet/profiles/profile-only-text.cxml +++ b/engine/src/main/resources/org/archive/crawler/restlet/profiles/profile-only-text.cxml @@ -741,6 +741,7 @@ http://example.example/example +