, Content, Text, ParseImpl>.Contex
Configuration conf = context.getConfiguration();
scfilters = new ScoringFilters(conf);
skipTruncated = conf.getBoolean(SKIP_TRUNCATED, true);
+ deleteFailedParse = conf.getBoolean(DELETE_FAILED_PARSE, false);
parseLatencyTracker = new LatencyTracker(
NutchMetrics.GROUP_PARSER, NutchMetrics.PARSER_LATENCY);
// Initialize error tracker with cached counters
@@ -119,6 +127,9 @@ public void map(WritableComparable> key, Content content,
// no fetch status, skip document
LOG.debug("Skipping {} as content has no fetch status", key);
return;
+ } else if(deleteFailedParse && Integer.parseInt(fetchStatus) == CrawlDatum.STATUS_PARSE_FAILED) {
+ LOG.debug("Skipping {} as un-parseable content will be deleted", key);
+ return;
} else if (Integer.parseInt(fetchStatus) != CrawlDatum.STATUS_FETCH_SUCCESS) {
// content not fetched successfully, skip document
LOG.debug("Skipping {} as content is not fetched successfully", key);
@@ -126,6 +137,7 @@ public void map(WritableComparable> key, Content content,
}
if (skipTruncated && isTruncated(content)) {
+ LOG.debug("Skipping {} as content is truncated", key);
return;
}
diff --git a/src/java/org/apache/nutch/parse/ParseStatus.java b/src/java/org/apache/nutch/parse/ParseStatus.java
index 25b8ae1b47..5d4e846800 100644
--- a/src/java/org/apache/nutch/parse/ParseStatus.java
+++ b/src/java/org/apache/nutch/parse/ParseStatus.java
@@ -188,6 +188,10 @@ public void write(DataOutput out) throws IOException {
public boolean isSuccess() {
return majorCode == SUCCESS;
}
+
+ public boolean isFailed() {
+ return majorCode == FAILED;
+ }
/**
* @return a String representation of the first argument,
diff --git a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
index d11634bbd1..547c373f1f 100644
--- a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
+++ b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
@@ -500,4 +500,33 @@ private static String probeContentType(java.nio.file.Path file)
String ct = Files.probeContentType(file);
return ct != null ? ct : "application/octet-stream";
}
+
+ /**
+ * Creates a new JettyServer with one static root context and the provided resource handler.
+ *
+ * @param port
+ * port to listen to
+ * @param staticContent
+ * folder where static content lives
+ * @param resourceHandler
+ * resource handler to override the default behavior if needed.
+ * @return configured Jetty server instance
+ * @throws UnknownHostException if the local loopback address cannot be resolved
+ */
+ @NonNull
+ public static Server getServer(int port, @NonNull String staticContent, ResourceHandler resourceHandler)
+ throws UnknownHostException {
+ Server webServer = new Server();
+
+ ServerConnector listener = new ServerConnector(webServer);
+ listener.setPort(port);
+ listener.setHost("127.0.0.1");
+ webServer.addConnector(listener);
+ ContextHandler staticContext = new ContextHandler();
+ staticContext.setContextPath("/");
+ staticContext.setResourceBase(staticContent);
+ staticContext.insertHandler(resourceHandler);
+ webServer.insertHandler(staticContext);
+ return webServer;
+ }
}
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
index 34743811a1..217039767c 100644
--- a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
@@ -21,9 +21,11 @@
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.ReducerContextWrapper;
+import org.apache.nutch.util.TimingUtil;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -323,7 +325,149 @@ public void testCrawlDbReducerNotModified() {
}
}
- protected class CrawlTestFetchNotModified extends ContinuousCrawlTestUtil {
+
+ /**
+ * NUTCH-1245: a fetch_gone should always result in a db_gone.
+ *
+ * Even in a long-running continuous crawl, when a gone page is re-fetched
+ * several times over time.
+ *
+ */
+ @Test
+ public void testCrawlDbReducerPageGoneSchedule1() {
+ LOG.info("NUTCH-1245: test long running continuous crawl");
+ ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(
+ STATUS_FETCH_GONE, STATUS_DB_GONE);
+ try {
+ if (!crawlUtil.run(20)) {
+ fail("fetch_gone did not result in a db_gone (NUTCH-1245)");
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * NUTCH-1245: a fetch_gone should always result in a db_gone.
+ *
+ * As some kind of misconfiguration set db.fetch.interval.default to a value
+ * > (fetchIntervalMax * 1.5).
+ *
+ */
+ @Test
+ public void testCrawlDbReducerPageGoneSchedule2() {
+ LOG.info("NUTCH-1245 (misconfiguration): test with db.fetch.interval.default > (1.5 * db.fetch.interval.max)");
+ Context context = CrawlDBTestUtil.createContext();
+ Configuration conf = context.getConfiguration();
+ int fetchIntervalMax = conf.getInt("db.fetch.interval.max", 0);
+ conf.setInt("db.fetch.interval.default", 3 + (int) (fetchIntervalMax * 1.5));
+ ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(context,
+ STATUS_FETCH_GONE, STATUS_DB_GONE);
+ try {
+ if (!crawlUtil.run(0)) {
+ fail("fetch_gone did not result in a db_gone (NUTCH-1245)");
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+
+ @Test
+ public void testCrawlDbReducerParseFailed() {
+ LOG.info("NUTCH-1732: allow deleting un-parsable documents");
+ Context context = CrawlDBTestUtil.createContext();
+ Configuration conf = context.getConfiguration();
+ conf.setBoolean(ParseSegment.DELETE_FAILED_PARSE, true);
+ CrawlTestParserFailure crawlUtil = new CrawlTestParserFailure(context);
+ try {
+ if (!crawlUtil.run(20)) {
+ fail("parse failure did not result in a parse_fail (NUTCH-1732)");
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * Test whether signatures are reset for "content-less" states (gone,
+ * redirect, etc.): otherwise, if this state is temporary and the document
+ * appears again with the old content, it may get marked as not_modified in
+ * CrawlDb just after the redirect state. In this case we cannot expect
+ * content in segments. Cf. NUTCH-1422: reset signature for redirects.
+ */
+ // TODO: can only test if solution is done in CrawlDbReducer
+ @Test
+ public void testSignatureReset() {
+ LOG.info("NUTCH-1422 must reset signature for redirects and similar states");
+ Context context = CrawlDBTestUtil.createContext();
+ Configuration conf = context.getConfiguration();
+ for (String sched : schedules) {
+ LOG.info("Testing reset signature with {}", sched);
+ conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+ ContinuousCrawlTestUtil crawlUtil = new CrawlTestSignatureReset(context);
+ try {
+ if (!crawlUtil.run(20)) {
+ fail("failed: signature not reset");
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+ /**
+ * NUTCH-578: a fetch_retry should result in a db_gone if db.fetch.retry.max
+ * is reached. Retry counter has to be reset appropriately.
+ */
+ @Test
+ public void testCrawlDbReducerPageRetrySchedule() {
+ LOG.info("NUTCH-578: test long running continuous crawl with fetch_retry");
+ ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestFetchRetry();
+ // keep going for long, to "provoke" a retry counter overflow
+ try {
+ if (!crawlUtil.run(150)) {
+ fail("fetch_retry did not result in a db_gone if retry counter > maxRetries (NUTCH-578)");
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * NUTCH-1564 AdaptiveFetchSchedule: sync_delta forces immediate re-fetch for
+ * documents not modified
+ *
+ * Problem: documents not modified for a longer time are fetched in every
+ * cycle because of an error in the SYNC_DELTA calculation of
+ * {@link AdaptiveFetchSchedule}.
+ * The next fetch time should always be in the future, never in the past.
+ *
+ */
+ @Test
+ public void testAdaptiveFetchScheduleSyncDelta() {
+ LOG.info("NUTCH-1564 test SYNC_DELTA calculation of AdaptiveFetchSchedule");
+ Context context = CrawlDBTestUtil.createContext();
+ Configuration conf = context.getConfiguration();
+ conf.setLong("db.fetch.interval.default", 172800); // 2 days
+ conf.setLong("db.fetch.schedule.adaptive.min_interval", 86400); // 1 day
+ conf.setLong("db.fetch.schedule.adaptive.max_interval", 604800); // 7 days
+ conf.setLong("db.fetch.interval.max", 604800); // 7 days
+ conf.set("db.fetch.schedule.class",
+ "org.apache.nutch.crawl.AdaptiveFetchSchedule");
+ ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchScheduleNotModifiedFetchTime(
+ context);
+ crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY / 3);
+ try {
+ if (!crawlUtil.run(100)) {
+ fail("failed: sync_delta calculation with AdaptiveFetchSchedule");
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ private class CrawlTestFetchNotModified extends ContinuousCrawlTestUtil {
/** time of the current fetch */
protected long currFetchTime;
@@ -451,7 +595,7 @@ protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
}
}
- protected class CrawlTestFetchNotModifiedHttp304 extends
+ private class CrawlTestFetchNotModifiedHttp304 extends
CrawlTestFetchNotModified {
CrawlTestFetchNotModifiedHttp304(Context context) {
@@ -487,79 +631,6 @@ protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
}
}
- /**
- * NUTCH-1245: a fetch_gone should always result in a db_gone.
- *
- * Even in a long-running continuous crawl, when a gone page is re-fetched
- * several times over time.
- *
- */
- @Test
- public void testCrawlDbReducerPageGoneSchedule1() {
- LOG.info("NUTCH-1245: test long running continuous crawl");
- ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(
- STATUS_FETCH_GONE, STATUS_DB_GONE);
- try {
- if (!crawlUtil.run(20)) {
- fail("fetch_gone did not result in a db_gone (NUTCH-1245)");
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- /**
- * NUTCH-1245: a fetch_gone should always result in a db_gone.
- *
- * As some kind of misconfiguration set db.fetch.interval.default to a value
- * > (fetchIntervalMax * 1.5).
- *
- */
- @Test
- public void testCrawlDbReducerPageGoneSchedule2() {
- LOG.info("NUTCH-1245 (misconfiguration): test with db.fetch.interval.default > (1.5 * db.fetch.interval.max)");
- Context context = CrawlDBTestUtil.createContext();
- Configuration conf = context.getConfiguration();
- int fetchIntervalMax = conf.getInt("db.fetch.interval.max", 0);
- conf.setInt("db.fetch.interval.default", 3 + (int) (fetchIntervalMax * 1.5));
- ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(context,
- STATUS_FETCH_GONE, STATUS_DB_GONE);
- try {
- if (!crawlUtil.run(0)) {
- fail("fetch_gone did not result in a db_gone (NUTCH-1245)");
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- /**
- * Test whether signatures are reset for "content-less" states (gone,
- * redirect, etc.): otherwise, if this state is temporary and the document
- * appears again with the old content, it may get marked as not_modified in
- * CrawlDb just after the redirect state. In this case we cannot expect
- * content in segments. Cf. NUTCH-1422: reset signature for redirects.
- */
- // TODO: can only test if solution is done in CrawlDbReducer
- @Test
- public void testSignatureReset() {
- LOG.info("NUTCH-1422 must reset signature for redirects and similar states");
- Context context = CrawlDBTestUtil.createContext();
- Configuration conf = context.getConfiguration();
- for (String sched : schedules) {
- LOG.info("Testing reset signature with {}", sched);
- conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
- ContinuousCrawlTestUtil crawlUtil = new CrawlTestSignatureReset(context);
- try {
- if (!crawlUtil.run(20)) {
- fail("failed: signature not reset");
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
-
private class CrawlTestSignatureReset extends ContinuousCrawlTestUtil {
byte[][] noContentStates = { { STATUS_FETCH_GONE, STATUS_DB_GONE },
@@ -607,4 +678,166 @@ protected boolean check(CrawlDatum result) {
}
+
+ private class ContinuousCrawlTestFetchRetry extends ContinuousCrawlTestUtil {
+
+ private int retryMax = 3;
+ private int totalRetries = 0;
+
+ ContinuousCrawlTestFetchRetry() {
+ super();
+ fetchStatus = STATUS_FETCH_RETRY;
+ retryMax = context.getConfiguration().getInt("db.fetch.retry.max", retryMax);
+ }
+
+ @Override
+ protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+ datum.setStatus(fetchStatus);
+ datum.setFetchTime(currentTime);
+ totalRetries++;
+ return datum;
+ }
+
+ @Override
+ protected boolean check(CrawlDatum result) {
+ if (result.getRetriesSinceFetch() > retryMax) {
+ LOG.warn("Retry counter > db.fetch.retry.max: {}", result);
+ } else if (result.getRetriesSinceFetch() == Byte.MAX_VALUE) {
+ LOG.warn("Retry counter max. value reached (overflow imminent): {}", result);
+ } else if (result.getRetriesSinceFetch() < 0) {
+ LOG.error("Retry counter overflow: {}", result);
+ return false;
+ }
+ // use the retry counter tracked by this class (totalRetries)
+ // instead of result.getRetriesSinceFetch(), because the retry counter
+ // in CrawlDatum could be reset (e.g. by NUTCH-578_v5.patch)
+ if (totalRetries < retryMax) {
+ if (result.getStatus() == STATUS_DB_UNFETCHED) {
+ LOG.info("ok: {}", result);
+ result.getRetriesSinceFetch();
+ return true;
+ }
+ } else {
+ if (result.getStatus() == STATUS_DB_GONE) {
+ LOG.info("ok: {}", result);
+ return true;
+ }
+ }
+ LOG.warn("wrong: {}", result);
+ return false;
+ }
+
+ }
+
+ private class CrawlTestFetchScheduleNotModifiedFetchTime extends
+ CrawlTestFetchNotModified {
+
+ // time of current fetch
+ private long fetchTime;
+
+ private long minInterval;
+ private long maxInterval;
+
+ CrawlTestFetchScheduleNotModifiedFetchTime(Context context) {
+ super(context);
+ Configuration conf = context.getConfiguration();
+ minInterval = conf.getLong("db.fetch.schedule.adaptive.min_interval",
+ 86400); // 1 day
+ maxInterval = conf.getLong("db.fetch.schedule.adaptive.max_interval",
+ 604800); // 7 days
+ if (conf.getLong("db.fetch.interval.max", 604800) < maxInterval) {
+ maxInterval = conf.getLong("db.fetch.interval.max", 604800);
+ }
+ }
+
+ @Override
+ protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+ // remember time of fetching
+ fetchTime = currentTime;
+ return super.fetch(datum, currentTime);
+ }
+
+ @Override
+ protected boolean check(CrawlDatum result) {
+ if (result.getStatus() == STATUS_DB_NOTMODIFIED) {
+ // check only status notmodified here
+ long secondsUntilNextFetch = (result.getFetchTime() - fetchTime) / 1000L;
+ if (secondsUntilNextFetch < -1) {
+ // next fetch time is in the past (more than one second)
+ LOG.error("Next fetch time is in the past: {}", result);
+ return false;
+ }
+ if (secondsUntilNextFetch < 60) {
+ // next fetch time is in less than one minute
+ // (critical: Nutch can hardly be so fast)
+ LOG.error("Less then one minute until next fetch: {}", result);
+ }
+ // Next fetch time should be within min. and max. (tolerance: 60 sec.)
+ if (secondsUntilNextFetch + 60 < minInterval
+ || secondsUntilNextFetch - 60 > maxInterval) {
+ LOG.error(
+ "Interval until next fetch time ({}) is not within min. and max. interval: {}",
+ TimingUtil.elapsedTime(fetchTime, result.getFetchTime()), result);
+ // TODO: is this a failure?
+ }
+ }
+ return true;
+ }
+
+ }
+
+ private class CrawlTestParserFailure extends ContinuousCrawlTestUtil {
+
+ int counter = 0;
+ boolean failing = false;
+
+ public CrawlTestParserFailure(Context context) {
+ super(context);
+ }
+
+ @Override
+ protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+ counter++;
+ if(counter % 2 == 0) {
+ failing = true;
+ // STATUS_PARSE_FAILED is normally set by FetcherThread.run after a successful fetch;
+ // this test exercises the CrawlDbReducer behavior after the fetch step
+ datum.setStatus(STATUS_PARSE_FAILED);
+ LOG.info("expect parse failed");
+ } else {
+ failing = false;
+ datum.setStatus(STATUS_FETCH_SUCCESS);
+ LOG.info("expect fetch success");
+ }
+ datum.setFetchTime(currentTime);
+ return datum;
+ }
+
+ @Override
+ protected List parse(CrawlDatum fetchDatum) {
+ List parseDatums = new ArrayList(0);
+ if (failing){
+ LOG.info("set parse failed");
+ parseDatums.add(new CrawlDatum(STATUS_PARSE_FAILED, 0));
+ } else {
+ LOG.info("set signature");
+ CrawlDatum signed = new CrawlDatum(STATUS_SIGNATURE, 0);
+ signed.setSignature(getSignature());
+ parseDatums.add(signed);
+ }
+ return parseDatums;
+ }
+
+ @Override
+ protected boolean check(CrawlDatum result) {
+ if (failing) {
+ LOG.info("check parse failed");
+ return result.getStatus() == STATUS_DB_PARSE_FAILED;
+ } else {
+ LOG.info("check fetched");
+ return result.getStatus() == STATUS_DB_FETCHED || result.getStatus() == STATUS_DB_NOTMODIFIED;
+ }
+ }
+ }
+
}
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbStatesExtended.java b/src/test/org/apache/nutch/crawl/TestCrawlDbStatesExtended.java
deleted file mode 100644
index 2e6ea55af1..0000000000
--- a/src/test/org/apache/nutch/crawl/TestCrawlDbStatesExtended.java
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.crawl;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.Reducer.Context;
-import org.apache.nutch.util.TimingUtil;
-import org.junit.jupiter.api.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.lang.invoke.MethodHandles;
-
-import static org.apache.nutch.crawl.CrawlDatum.*;
-import static org.junit.jupiter.api.Assertions.fail;
-
-public class TestCrawlDbStatesExtended extends TestCrawlDbStates {
-
- private static final Logger LOG = LoggerFactory
- .getLogger(MethodHandles.lookup().lookupClass());
-
- /**
- * NUTCH-578: a fetch_retry should result in a db_gone if db.fetch.retry.max
- * is reached. Retry counter has to be reset appropriately.
- */
- @Test
- public void testCrawlDbReducerPageRetrySchedule() {
- LOG.info("NUTCH-578: test long running continuous crawl with fetch_retry");
- ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestFetchRetry();
- // keep going for long, to "provoke" a retry counter overflow
- try {
- if (!crawlUtil.run(150)) {
- fail("fetch_retry did not result in a db_gone if retry counter > maxRetries (NUTCH-578)");
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- private class ContinuousCrawlTestFetchRetry extends ContinuousCrawlTestUtil {
-
- private int retryMax = 3;
- private int totalRetries = 0;
-
- ContinuousCrawlTestFetchRetry() {
- super();
- fetchStatus = STATUS_FETCH_RETRY;
- retryMax = context.getConfiguration().getInt("db.fetch.retry.max", retryMax);
- }
-
- @Override
- protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
- datum.setStatus(fetchStatus);
- datum.setFetchTime(currentTime);
- totalRetries++;
- return datum;
- }
-
- @Override
- protected boolean check(CrawlDatum result) {
- if (result.getRetriesSinceFetch() > retryMax) {
- LOG.warn("Retry counter > db.fetch.retry.max: {}", result);
- } else if (result.getRetriesSinceFetch() == Byte.MAX_VALUE) {
- LOG.warn("Retry counter max. value reached (overflow imminent): {}", result);
- } else if (result.getRetriesSinceFetch() < 0) {
- LOG.error("Retry counter overflow: {}", result);
- return false;
- }
- // use retry counter bound to this class (totalRetries)
- // instead of result.getRetriesSinceFetch() because the retry counter
- // in CrawlDatum could be reset (eg. NUTCH-578_v5.patch)
- if (totalRetries < retryMax) {
- if (result.getStatus() == STATUS_DB_UNFETCHED) {
- LOG.info("ok: {}", result);
- result.getRetriesSinceFetch();
- return true;
- }
- } else {
- if (result.getStatus() == STATUS_DB_GONE) {
- LOG.info("ok: {}", result);
- return true;
- }
- }
- LOG.warn("wrong: {}", result);
- return false;
- }
-
- }
-
- /**
- * NUTCH-1564 AdaptiveFetchSchedule: sync_delta forces immediate re-fetch for
- * documents not modified
- *
- * Problem: documents not modified for a longer time are fetched in every
- * cycle because of an error in the SYNC_DELTA calculation of
- * {@link AdaptiveFetchSchedule}.
- * The next fetch time should always be in the future, never in the past.
- *
- */
- @Test
- public void testAdaptiveFetchScheduleSyncDelta() {
- LOG.info("NUTCH-1564 test SYNC_DELTA calculation of AdaptiveFetchSchedule");
- Context context = CrawlDBTestUtil.createContext();
- Configuration conf = context.getConfiguration();
- conf.setLong("db.fetch.interval.default", 172800); // 2 days
- conf.setLong("db.fetch.schedule.adaptive.min_interval", 86400); // 1 day
- conf.setLong("db.fetch.schedule.adaptive.max_interval", 604800); // 7 days
- conf.setLong("db.fetch.interval.max", 604800); // 7 days
- conf.set("db.fetch.schedule.class",
- "org.apache.nutch.crawl.AdaptiveFetchSchedule");
- ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchScheduleNotModifiedFetchTime(
- context);
- crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY / 3);
- try {
- if (!crawlUtil.run(100)) {
- fail("failed: sync_delta calculation with AdaptiveFetchSchedule");
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- private class CrawlTestFetchScheduleNotModifiedFetchTime extends
- CrawlTestFetchNotModified {
-
- // time of current fetch
- private long fetchTime;
-
- private long minInterval;
- private long maxInterval;
-
- CrawlTestFetchScheduleNotModifiedFetchTime(Context context) {
- super(context);
- Configuration conf = context.getConfiguration();
- minInterval = conf.getLong("db.fetch.schedule.adaptive.min_interval",
- 86400); // 1 day
- maxInterval = conf.getLong("db.fetch.schedule.adaptive.max_interval",
- 604800); // 7 days
- if (conf.getLong("db.fetch.interval.max", 604800) < maxInterval) {
- maxInterval = conf.getLong("db.fetch.interval.max", 604800);
- }
- }
-
- @Override
- protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
- // remember time of fetching
- fetchTime = currentTime;
- return super.fetch(datum, currentTime);
- }
-
- @Override
- protected boolean check(CrawlDatum result) {
- if (result.getStatus() == STATUS_DB_NOTMODIFIED) {
- // check only status notmodified here
- long secondsUntilNextFetch = (result.getFetchTime() - fetchTime) / 1000L;
- if (secondsUntilNextFetch < -1) {
- // next fetch time is in the past (more than one second)
- LOG.error("Next fetch time is in the past: {}", result);
- return false;
- }
- if (secondsUntilNextFetch < 60) {
- // next fetch time is in less than one minute
- // (critical: Nutch can hardly be so fast)
- LOG.error("Less then one minute until next fetch: {}", result);
- }
- // Next fetch time should be within min. and max. (tolerance: 60 sec.)
- if (secondsUntilNextFetch + 60 < minInterval
- || secondsUntilNextFetch - 60 > maxInterval) {
- LOG.error(
- "Interval until next fetch time ({}) is not within min. and max. interval: {}",
- TimingUtil.elapsedTime(fetchTime, result.getFetchTime()), result);
- // TODO: is this a failure?
- }
- }
- return true;
- }
-
- }
-
-}
diff --git a/src/test/org/apache/nutch/fetcher/TestFetchWithParseFailures.java b/src/test/org/apache/nutch/fetcher/TestFetchWithParseFailures.java
new file mode 100644
index 0000000000..0ed666558d
--- /dev/null
+++ b/src/test/org/apache/nutch/fetcher/TestFetchWithParseFailures.java
@@ -0,0 +1,264 @@
+package org.apache.nutch.fetcher;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.AbstractFetchSchedule;
+import org.apache.nutch.crawl.CrawlDBTestUtil;
+import org.apache.nutch.crawl.Generator;
+import org.apache.nutch.crawl.Injector;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.ParseSegment;
+import org.apache.nutch.protocol.Content;
+import org.eclipse.jetty.server.Request;
+import org.eclipse.jetty.server.Server;
+import org.eclipse.jetty.server.handler.ResourceHandler;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+public class TestFetchWithParseFailures {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
+ private static final Path TEST_DIR = new Path("build/test/test-fail-parse");
+
+ private static final String BASE_FOLDER = "build/test/data/fetch-parse-failure";
+ private static final String TEST_FILE = "test.html";
+
+ private Configuration conf;
+ private FileSystem fs;
+ private Path crawldbPath;
+ private Path segmentsPath;
+ private Path urlPath;
+ private Server server;
+
+ private static List files;
+ private static java.nio.file.Path baseFolderPath;
+
+ private static ExecutorService executor = Executors.newCachedThreadPool();
+ private static final AtomicInteger FETCH_COUNT = new AtomicInteger(0);
+
+
+ @BeforeEach
+ public void setUp() throws Exception {
+ baseFolderPath = java.nio.file.Paths.get(BASE_FOLDER);
+
+ files = java.nio.file.Files.list(baseFolderPath).map(p -> p.getFileName().toString()).filter(n -> !n.equals("robots.txt")).collect(Collectors.toList());
+ Collections.sort(files);
+ LOG.info("TEST FILES : " + files);
+
+ conf = CrawlDBTestUtil.createContext().getConfiguration();
+ // Do not include 'parse-tika', because it would parse anything
+ conf.set("plugin.includes", "protocol-http|urlfilter-regex|parse-html|index-(basic|anchor)|indexer-csv|scoring-opic|urlnormalizer-(pass|regex|basic)");
+ conf.setInt("fetcher.threads.fetch", 1);
+ // force an exception from ParserFactory.getParsers(...): ParserNotFound
+ conf.setBoolean("mime.type.magic", false);
+ conf.setBoolean("fetcher.parse", true);
+ conf.setBoolean("fetcher.store.content", true);
+ conf.setBoolean(ParseSegment.DELETE_FAILED_PARSE, true);
+
+ fs = FileSystem.get(conf);
+ fs.delete(TEST_DIR, true);
+ crawldbPath = new Path(TEST_DIR, "crawldb");
+ segmentsPath = new Path(TEST_DIR, "segments");
+ urlPath = new Path(TEST_DIR, "urls");
+ server = CrawlDBTestUtil.getServer(
+ conf.getInt("content.server.port", 1234),
+ BASE_FOLDER, new ParseFailureResourceHandler());
+ server.start();
+ }
+
+ @AfterEach
+ public void tearDown() throws Exception {
+ executor.shutdown();
+ server.stop();
+ for (int i = 0; i < 5; i++) {
+ if (!server.isStopped()) {
+ Thread.sleep(1000);
+ }
+ }
+ fs.delete(TEST_DIR, true);
+ }
+
+
+ @Test
+ public void testFetchWithParseFailure() throws Exception {
+ AbstractFetchSchedule schedule = new AbstractFetchSchedule(conf) {};
+
+ // generate seedlist
+ ArrayList urls = new ArrayList();
+ files.forEach(f -> urls.add("http://127.0.0.1:" + server.getURI().getPort() + "/" + f));
+ CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
+
+ // inject
+ Injector injector = new Injector(conf);
+ injector.inject(crawldbPath, urlPath);
+
+ // generate
+ Generator g1 = new Generator(conf);
+ Path[] generatedSegment1 = g1.generate(crawldbPath, segmentsPath, 1,
+ Long.MAX_VALUE, Long.MAX_VALUE, false, false, true, 1, null);
+ Assertions.assertNotNull(generatedSegment1);
+
+ Map args1 = Map.of("segment", generatedSegment1[0]);
+ // fetch once
+ LOG.info("1ST FETCH");
+ Fetcher fetcher1 = new Fetcher(conf);
+ Map result1 = executor.submit(new Callable