Skip to content

Commit

Permalink
PUC better logging and decrease max cost
Browse files Browse the repository at this point in the history
  • Loading branch information
cgr71ii committed Sep 25, 2022
1 parent 1ff144a commit d6990e8
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@

import org.archive.spring.HasKeyedProperties;
import org.archive.spring.KeyedProperties;
import org.archive.spring.ConfigPath;

import org.archive.io.GenerationFileHandler;

import org.json.JSONObject;
import org.json.JSONArray;
Expand All @@ -43,27 +46,46 @@
import java.net.URL;

/**
* A CostAssignment policy that uses a constant value of 1 for all CrawlURIs.
* A CostAssignment policy that sends the via and current URIs to a
* classifier through its API and uses the returned score as
* inverse cost (1 - score)
*
* @author cgr71ii
*/
public class PUCCostAssignmentPolicy extends CostAssignmentPolicy implements HasKeyedProperties {
//public class PUCCostAssignmentPolicy extends CostAssignmentPolicy {
//public class PUCCostAssignmentPolicy extends CostAssignmentPolicy implements Serializable, HasKeyedProperties {

private static final long serialVersionUID = 1L;

private static final ConfigPath logFile = new ConfigPath(PUCCostAssignmentPolicy.class.getName(),"${launchId}/logs/cost_puc.log");
private static final Logger logger = Logger.getLogger(PUCCostAssignmentPolicy.class.getName());

protected KeyedProperties kp = new KeyedProperties();
/**
 * Exposes the keyed-properties container held by this policy.
 *
 * @return the {@code kp} instance stored on this object
 */
public KeyedProperties getKeyedProperties() {
    return this.kp;
}

/**
 * Routes this class' logger to its dedicated log file.
 *
 * Sets the logger level to INFO, attaches a GenerationFileHandler opened on
 * the absolute path of {@code logFile}, and disables propagation to parent
 * handlers once a file handler is in place.
 *
 * The logger is static, but this method is invoked from the instance
 * initializer, i.e. once per created policy object. The handler is therefore
 * attached only if no GenerationFileHandler is registered yet; otherwise every
 * new instance would add one more handler and each record would be written
 * several times. The method is synchronized so that two instances created
 * concurrently cannot both pass the check.
 *
 * @throws IOException propagated from the handler creation
 * @throws SecurityException propagated from the logger/handler configuration
 */
private static synchronized void setupLogFile() throws IOException, SecurityException {
    logger.setLevel(Level.INFO);

    boolean fileHandlerAttached = false;

    for (java.util.logging.Handler attached : logger.getHandlers()) {
        if (attached instanceof GenerationFileHandler) {
            fileHandlerAttached = true;
            break;
        }
    }

    if (!fileHandlerAttached) {
        GenerationFileHandler fh = GenerationFileHandler.makeNew(logFile.getFile().getAbsolutePath(), false, false);

        logger.addHandler(fh);
    }

    // Only reached when a file handler exists, so a failed setup keeps
    // reporting through the parent handlers.
    logger.setUseParentHandlers(false);
}

{
setMetricServerUrl("http://localhost:5000/inference");
setUserAgent(String.format("heritrix: %s", PUCCostAssignmentPolicy.class.getName()));
setUrlsBase64(true);
setLoggerFine(false);

try {
setupLogFile();
} catch (Exception e) {
if (logger.isLoggable(Level.WARNING)) {
logger.warning("couldn't setup the log file: " + e.toString());
}
}
}

public String getMetricServerUrl() {
Expand Down Expand Up @@ -200,7 +222,7 @@ public double requestMetric(String src_url, String trg_url) {
}

// The result is expected to be [0, 1]
result = result * 100.0 * 100.0; // [0, 1] -> [0, 100] -> more precision (2 decimals) [0, 10000]
result = result * 100.0; // [0, 1] -> [0, 100]

return result;
}
Expand All @@ -212,7 +234,7 @@ public int costOf(CrawlURI curi) {
UURI uri = curi.getUURI();
UURI via = curi.getVia();
String str_uri = uri.toCustomString();
int cost = 10001;
int cost = 101;

if (via != null) {
String str_via = via.toCustomString();
Expand All @@ -222,10 +244,10 @@ public int costOf(CrawlURI curi) {
str_via = Base64.getEncoder().encodeToString(str_via.getBytes());
}

// Metric should be a value in [0, 10000]
// Metric should be a value in [0, 100]
double similarity = requestMetric(str_uri, str_via);

cost = 10000 - (int)similarity + 1; // [1, 10001]
cost = 100 - (int)similarity + 1; // [1, 101]

if (logger.isLoggable(Level.FINE)) {
logger.fine(String.format("cost<tab>similarity<tab>via<tab>uri: %d\t%f\t%s\t%s", cost, similarity, str_via, str_uri));
Expand Down
2 changes: 1 addition & 1 deletion install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ if [[ ! -z "$HERITRIX_SKIP_TESTS" ]]; then
if [[ "$HERITRIX_SKIP_TESTS" == "true" ]]; then
HERITRIX_SKIP_TESTS="-DskipTests"
elif [[ "$HERITRIX_SKIP_TESTS" == "false" ]]; then
HERITRIX_SKIP_TESTS = ""
HERITRIX_SKIP_TESTS=""
else
>&2 echo "skip-tests only allows 'true' and 'false'"

Expand Down

0 comments on commit d6990e8

Please sign in to comment.