Skip to content

Commit

Permalink
New log file using logging heritrix API
Browse files Browse the repository at this point in the history
Target: URI cost policies
  • Loading branch information
cgr71ii committed Mar 23, 2023
1 parent 04f429e commit c9be5b3
Show file tree
Hide file tree
Showing 8 changed files with 137 additions and 55 deletions.
2 changes: 2 additions & 0 deletions commons/src/main/java/org/archive/bdb/BdbModule.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;

// Read DB: https://github.com/opensourceBIM/BIMserver/issues/531#issuecomment-303364364

import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@

import org.archive.crawler.util.PUC;

import org.archive.crawler.reporting.CrawlerLoggerModule;

import org.springframework.beans.factory.annotation.Autowired;

import org.archive.net.UURI;

import org.archive.modules.CrawlURI;
Expand Down Expand Up @@ -47,22 +51,32 @@ public class PUCCostAssignmentPolicy extends CostAssignmentPolicy implements Has

private static final long serialVersionUID = 1L;

//private static final ConfigPath logFile = new ConfigPath(PUCCostAssignmentPolicy.class.getName(),"${launchId}/logs/cost_puc.log");
private static final Logger logger = Logger.getLogger(PUCCostAssignmentPolicy.class.getName());

protected KeyedProperties kp = new KeyedProperties();
/**
 * Exposes the keyed-property store backing this policy's settings.
 *
 * @return the {@code kp} instance held by this object (never replaced here)
 */
public KeyedProperties getKeyedProperties() {
    return this.kp;
}

protected CrawlerLoggerModule loggerModule;
/**
 * Gives access to the logger module this policy was configured with.
 *
 * @return the current {@code loggerModule}; {@code null} if none has been set
 */
public CrawlerLoggerModule getLoggerModule() {
    return loggerModule;
}
/**
 * Stores the logger module whose URI cost logger this policy writes to.
 * Annotated {@code @Autowired} so the container can supply the module.
 *
 * @param loggerModule the module to use; stored as-is, without validation
 */
@Autowired
public void setLoggerModule(CrawlerLoggerModule loggerModule) {
this.loggerModule = loggerModule;
}

/**
 * Looks up the logger used for all messages of this policy: the URI cost
 * logger owned by the configured {@code loggerModule}.
 *
 * @return whatever {@code loggerModule.getUriCost()} returns
 */
private Logger getLogger() {
    final Logger uriCostLogger = loggerModule.getUriCost();
    return uriCostLogger;
}

/*
private static void setupLogFile() throws IOException, SecurityException {
logger.setLevel(Level.INFO);
getLogger().setLevel(Level.INFO);
GenerationFileHandler fh = GenerationFileHandler.makeNew(logFile.getFile().getAbsolutePath(), false, false);
logger.addHandler(fh);
logger.setUseParentHandlers(false);
getLogger().addHandler(fh);
getLogger().setUseParentHandlers(false);
}
*/

Expand All @@ -83,8 +97,8 @@ private static void setupLogFile() throws IOException, SecurityException {
try {
setupLogFile();
} catch (Exception e) {
if (logger.isLoggable(Level.WARNING)) {
logger.warning("couldn't setup the log file: " + e.toString());
if (getLogger().isLoggable(Level.WARNING)) {
getLogger().warning("couldn't setup the log file: " + e.toString());
}
}
*/
Expand Down Expand Up @@ -175,13 +189,6 @@ public Boolean getLoggerFine() {
}

public void setLoggerFine(Boolean logger_fine) {
if (logger_fine) {
logger.setLevel(Level.FINE);
}
else {
logger.setLevel(Level.INFO);
}

kp.put("loggerFine", logger_fine);
}

Expand All @@ -193,29 +200,29 @@ public double requestMetric(String src_url, String trg_url, String src_lang, Str
request_param = String.format("src_urls=%s&trg_urls=%s", src_url, trg_url);

if (getUseLanguages()) {
logger.log(Level.WARNING, String.format("Src or trg is empty but languages were expected: %s %s %s %s", src_lang, trg_lang, src_url, trg_url));
getLogger().log(Level.WARNING, String.format("Src or trg is empty but languages were expected: %s %s %s %s", src_lang, trg_lang, src_url, trg_url));
}
}
else {
request_param = String.format("src_urls=%s&trg_urls=%s&src_urls_lang=%s&trg_urls_lang=%s", src_url, trg_url, src_lang, trg_lang);
}

if (logger.isLoggable(Level.FINER)) {
logger.finer("request param: " + request_param);
if (getLogger().isLoggable(Level.FINER)) {
getLogger().finer("request param: " + request_param);
}

try {
String request_result = PUC.sendPOST(getMetricServerUrl(), request_param, getUserAgent());

if (request_result == null) {
logger.log(Level.WARNING, "Request result was null");
getLogger().log(Level.WARNING, "Request result was null");

// Set similarity to minimum value
result = 0.0;
}
else {
if (logger.isLoggable(Level.FINER)) {
logger.finer("request result: " + request_result);
if (getLogger().isLoggable(Level.FINER)) {
getLogger().finer("request result: " + request_result);
}

JSONObject obj = new JSONObject(request_result);
Expand All @@ -224,7 +231,7 @@ public double requestMetric(String src_url, String trg_url, String src_lang, Str
JSONArray scores = obj.getJSONArray("ok");

if (scores.length() != 1) {
logger.log(Level.WARNING, String.format("Unexpected length of scores: %d", scores.length()));
getLogger().log(Level.WARNING, String.format("Unexpected length of scores: %d", scores.length()));

// Set similarity to minimum value
result = 0.0;
Expand All @@ -237,14 +244,14 @@ public double requestMetric(String src_url, String trg_url, String src_lang, Str
try {
String error = obj.getString("err");

logger.log(Level.WARNING, String.format("PUC error: %s", error), e);
getLogger().log(Level.WARNING, String.format("PUC error: %s", error), e);
} catch (JSONException e2) {
logger.log(Level.WARNING, "JSON exception", e2);
getLogger().log(Level.WARNING, "JSON exception", e2);
}
}
}
} catch (Exception e) {
logger.log(Level.WARNING, "Request exception", e);
getLogger().log(Level.WARNING, "Request exception", e);

// Set similarity to minimum value
result = 0.0;
Expand All @@ -257,6 +264,13 @@ public double requestMetric(String src_url, String trg_url, String src_lang, Str
}

public int costOf(CrawlURI curi) {
if (getLoggerFine()) {
getLogger().setLevel(Level.FINE);
}
else {
getLogger().setLevel(Level.INFO);
}

UURI uri = curi.getUURI();
UURI via = curi.getVia();
String str_uri = PUC.removeTrailingSlashes(uri.toCustomString());
Expand All @@ -268,8 +282,8 @@ public int costOf(CrawlURI curi) {
return 1;
}
if (via == null) {
if (logger.isLoggable(Level.FINE)) {
logger.fine(String.format("via is null. uri: (cost: %d) %s", cost, str_uri));
if (getLogger().isLoggable(Level.FINE)) {
getLogger().fine(String.format("via is null. uri: (cost: %d) %s", cost, str_uri));
}

return 1;
Expand All @@ -289,7 +303,7 @@ public int costOf(CrawlURI curi) {
}
if ((!str_uri.startsWith("http://") && !str_uri.startsWith("https://")) ||
(!str_via.startsWith("http://") && !str_via.startsWith("https://"))) {
logger.log(Level.WARNING, String.format("Unexpected URI scheme: %s -> %s", str_via, str_uri));
getLogger().log(Level.WARNING, String.format("Unexpected URI scheme: %s -> %s", str_via, str_uri));

return 150;
}
Expand All @@ -299,8 +313,8 @@ public int costOf(CrawlURI curi) {
// We want via and current URI documents to be HTML, but current URI head hasn't been downloaded yet...
if (!via_curi.getContentType().startsWith("text/html")) {
// The content is not HTML
if (logger.isLoggable(Level.FINE)) {
logger.fine(String.format("Content-Type is not HTML: via uri (via content-type): %s (%s)", str_via, via_curi.getContentType()));
if (getLogger().isLoggable(Level.FINE)) {
getLogger().fine(String.format("Content-Type is not HTML: via uri (via content-type): %s (%s)", str_via, via_curi.getContentType()));
}

return 100;
Expand All @@ -309,16 +323,16 @@ public int costOf(CrawlURI curi) {

String lang_ok = "";
String detected_lang = "";
String uri_domain = PUC.getDomain(str_uri, logger);
String via_domain = PUC.getDomain(str_via, logger);
String uri_domain = PUC.getDomain(str_uri, getLogger());
String via_domain = PUC.getDomain(str_via, getLogger());

// Language
if (getUseLanguages()) {
String lang1 = getLangPreference1();
String lang2 = getLangPreference2();
String[] lang_result;

lang_result = PUC.isLangOk(curi, lang1, lang2, getOnlyReliableDetection(), logger);
lang_result = PUC.isLangOk(curi, lang1, lang2, getOnlyReliableDetection(), getLogger());
lang_ok = lang_result[0];
detected_lang = lang_result[1];

Expand All @@ -333,7 +347,7 @@ else if (lang_ok.equals(lang2)) {
trg_urls_lang = lang1;
}
else {
logger.log(Level.WARNING, String.format("Unexpected languages mismatch: lang1 | lang2 | lang_ok | detected_lang: %s | %s | %s | %s", lang1, lang2, lang_ok, detected_lang));
getLogger().log(Level.WARNING, String.format("Unexpected languages mismatch: lang1 | lang2 | lang_ok | detected_lang: %s | %s | %s | %s", lang1, lang2, lang_ok, detected_lang));
}
}
}
Expand All @@ -357,8 +371,8 @@ else if (getSameDomain() && !uri_domain.equals(via_domain)) {

cost = 100 - (int)(similarity + 0.5) + exploration_value; // [exploration_value, 100 + exploration_value]

if (logger.isLoggable(Level.FINE)) {
logger.fine(String.format("cost | similarity | via (detected lang) -> uri | src_lang - trg_lang: %d | %f | %s (%s) -> %s | %s - %s", cost, similarity, str_via, detected_lang, str_uri, src_urls_lang, trg_urls_lang));
if (getLogger().isLoggable(Level.FINE)) {
getLogger().fine(String.format("cost | similarity | via (detected lang) -> uri | src_lang - trg_lang: %d | %f | %s (%s) -> %s | %s - %s", cost, similarity, str_via, detected_lang, str_uri, src_urls_lang, trg_urls_lang));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ protected String getCoreKey(UURI basis) {
}

protected String getSurtAuthority(String surt) {
// A URI scheme://domain.tld/path?query has a SURT form of scheme://(tld,domain,)/path?query. (https://heritrix.readthedocs.io/en/stable/glossary.html)
// scheme://(tld,domain,)/path?query -> return "tld,domain,"
int indexOfOpen = surt.indexOf("://(");
int indexOfClose = surt.indexOf(")");
if (indexOfOpen == -1 || indexOfClose == -1
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.io;

import java.util.logging.Formatter;
import java.util.logging.LogRecord;


/**
 * Formatter for the URI cost log: writes the record's message followed by a
 * newline, without adding a timestamp, level or logger name.
 *
 * <p>The message is resolved through {@link Formatter#formatMessage(LogRecord)},
 * so records carrying parameters have their <code>{0}</code>-style placeholders
 * substituted. When the record carries a {@link Throwable}, its
 * {@code toString()} is appended in square brackets after the message so that
 * logged exceptions are not silently lost.
 *
 * @author Cristian García-Romero
 */
public class UriCostLogFormatter extends Formatter {
    public UriCostLogFormatter() {
        super();
    }

    /**
     * Renders a single log record.
     *
     * @param record the record to render
     * @return the formatted message, then {@code " [<throwable>]"} if the
     *         record has a thrown exception, then {@code "\n"}
     */
    @Override
    public String format(LogRecord record) {
        StringBuilder line = new StringBuilder();
        // formatMessage() returns the raw message when the record has no
        // parameters, so plain records are rendered exactly as before.
        line.append(formatMessage(record));

        Throwable thrown = record.getThrown();
        if (thrown != null) {
            // Keep the exception's class and message; callers log with a
            // Throwable and would otherwise lose the cause entirely.
            line.append(" [").append(thrown).append(']');
        }

        return line.append('\n').toString();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.archive.crawler.io.NonFatalErrorFormatter;
import org.archive.crawler.io.RuntimeErrorFormatter;
import org.archive.crawler.io.StatisticsLogFormatter;
import org.archive.crawler.io.UriCostLogFormatter;
import org.archive.crawler.io.UriErrorFormatter;
import org.archive.crawler.io.UriProcessingFormatter;
import org.archive.crawler.util.Logs;
Expand Down Expand Up @@ -98,6 +99,7 @@ public void setLogExtraInfo(boolean logExtraInfo) {
private static final String LOGNAME_ALERTS = "alerts";
private static final String LOGNAME_PROGRESS_STATISTICS =
"progress-statistics";
private static final String LOGNAME_URI_COST = "uri-cost";
private static final String LOGNAME_URI_ERRORS = "uri-errors";
private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors";
private static final String LOGNAME_NONFATAL_ERRORS = "nonfatal-errors";
Expand Down Expand Up @@ -129,6 +131,15 @@ public ConfigPath getProgressLogPath() {
public void setProgressLogPath(ConfigPath cp) {
this.progressLogPath.merge(cp);
}

protected ConfigPath uriCostLogPath =
new ConfigPath(Logs.URI_COST.getFilename(),Logs.URI_COST.getFilename());
/**
 * @return the configured location of the URI cost log file
 */
public ConfigPath getUriCostLogPath() {
    return this.uriCostLogPath;
}
/**
 * Updates the URI cost log location by merging the given path into the
 * existing {@code uriCostLogPath}; the field itself is not replaced.
 *
 * @param cp the path whose settings are merged in
 */
public void setUriCostLogPath(ConfigPath cp) {
    uriCostLogPath.merge(cp);
}

protected ConfigPath uriErrorsLogPath =
new ConfigPath(Logs.URI_ERRORS.getFilename(),Logs.URI_ERRORS.getFilename());
Expand Down Expand Up @@ -194,6 +205,11 @@ public void setNonfatalErrorsLogPath(ConfigPath cp) {
*/
private transient Logger progressStats;

/**
* Cost assignment policies (e.g. PUCCostAssignmentPolicy) write their log messages here.
*/
private transient Logger uriCost;

/**
* Record of fileHandlers established for loggers,
* assisting file rotation.
Expand Down Expand Up @@ -244,6 +260,7 @@ protected void setupLogs() throws IOException {
uriErrors = Logger.getLogger(LOGNAME_URI_ERRORS + "." + logsPath);
progressStats = Logger.getLogger(LOGNAME_PROGRESS_STATISTICS + "." +
logsPath);
uriCost = Logger.getLogger(LOGNAME_URI_COST + "." + logsPath);

this.fileHandlers = new HashMap<Logger,FileHandler>();
setupLogFile(uriProcessing,
Expand All @@ -266,6 +283,10 @@ protected void setupLogs() throws IOException {
getProgressLogPath().getFile().getAbsolutePath(),
new StatisticsLogFormatter(), true);

setupLogFile(uriCost,
getUriCostLogPath().getFile().getAbsolutePath(),
new UriCostLogFormatter(), true);

setupAlertLog(logsPath);
}

Expand Down Expand Up @@ -418,6 +439,10 @@ public Logger getUriErrors() {
/**
 * @return the logger held in {@code uriProcessing}
 */
public Logger getUriProcessing() {
    return this.uriProcessing;
}

/**
 * @return the logger held in {@code uriCost}, which {@code setupLogs()}
 *         creates and attaches to the URI cost log file
 */
public Logger getUriCost() {
    return this.uriCost;
}

public int getAlertCount() {
if (atg != null) {
Expand Down Expand Up @@ -459,7 +484,7 @@ private void readObject(ObjectInputStream in)

public void afterPropertiesSet() throws Exception {
ConfigPath[] paths = {
crawlLogPath, alertsLogPath, progressLogPath,
crawlLogPath, alertsLogPath, progressLogPath, uriCostLogPath,
uriErrorsLogPath, runtimeErrorsLogPath, nonfatalErrorsLogPath };
for(ConfigPath cp : paths) {
if(cp.getBase()==null) {
Expand Down
Loading

0 comments on commit c9be5b3

Please sign in to comment.