Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1503,6 +1503,14 @@
</description>
</property>

<property>
<name>indexer.delete</name>
<value>false</value>
<description>Whether the indexer will delete documents that are gone. Gone pages include redirects and duplicates.
See also: 'link.delete.gone'.
</description>
</property>

<property>
<name>indexer.indexwriters.file</name>
<value>index-writers.xml</value>
Expand Down Expand Up @@ -1797,6 +1805,15 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
</description>
</property>

<property>
<name>parser.delete.failed.parse</name>
<value>false</value>
<description>Whether to delete a page from the index when parsing the page fails.
This property is deactivated by default, because enabling it deletes a page that is already in the index
because a previous fetch produced content that was parsed successfully.
</description>
</property>

<property>
<name>parser.store.text</name>
<value>true</value>
Expand Down Expand Up @@ -2507,7 +2524,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
<property>
<name>link.delete.gone</name>
<value>false</value>
<description>Whether to delete gone pages from the web graph.</description>
<description>Whether to delete gone pages from the web graph. Gone pages include redirects and duplicates.</description>
</property>

<property>
Expand Down
18 changes: 11 additions & 7 deletions src/java/org/apache/nutch/crawl/CrawlDatum.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
public static final byte STATUS_DB_DUPLICATE = 0x07;
/** Page was marked as orphan, e.g. has no inlinks anymore */
public static final byte STATUS_DB_ORPHAN = 0x08;
/** Page parsing failed */
public static final byte STATUS_DB_PARSE_FAILED = 0x09;

/** Maximum value of DB-related status. */
public static final byte STATUS_DB_MAX = 0x1f;
Expand Down Expand Up @@ -103,6 +105,8 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
public static final byte STATUS_LINKED = 0x43;
/** Page got metadata from a parser */
public static final byte STATUS_PARSE_META = 0x44;
/** Page was fetched successfully, but parsing it failed */
public static final byte STATUS_PARSE_FAILED = 0x45;

public static final HashMap<Byte, String> statNames = new HashMap<>();
static {
Expand All @@ -114,6 +118,7 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
statNames.put(STATUS_DB_DUPLICATE, "db_duplicate");
statNames.put(STATUS_DB_ORPHAN, "db_orphan");
statNames.put(STATUS_DB_PARSE_FAILED, "db_parse_failed");
statNames.put(STATUS_SIGNATURE, "signature");
statNames.put(STATUS_INJECTED, "injected");
statNames.put(STATUS_LINKED, "linked");
Expand All @@ -124,6 +129,7 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
statNames.put(STATUS_FETCH_GONE, "fetch_gone");
statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");
statNames.put(STATUS_PARSE_META, "parse_metadata");
statNames.put(STATUS_PARSE_FAILED, "parse_failed");

oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
Expand All @@ -144,16 +150,14 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
private long modifiedTime;
private org.apache.hadoop.io.MapWritable metaData;

/**
 * Validate DB status (ref.: CrawlDbReducer, IndexerReducer).
 *
 * @param datum the datum whose status byte is checked
 * @return {@code true} if the status is not greater than
 *         {@link #STATUS_DB_MAX}, {@code false} otherwise
 */
public static boolean hasDbStatus(CrawlDatum datum) {
  // STATUS_DB_PARSE_FAILED (0x09) lies below STATUS_DB_MAX (0x1f), so the
  // range check already covers it; no separate comparison is needed.
  return datum.status <= STATUS_DB_MAX;
}

/**
 * Validate fetch status (ref.: CrawlDbReducer, IndexerReducer,
 * SegmentMergerReducer).
 *
 * @param datum the datum whose status byte is checked
 * @return {@code true} if the status is greater than {@link #STATUS_DB_MAX}
 *         and not greater than {@code STATUS_FETCH_MAX}, or if it equals
 *         {@link #STATUS_PARSE_FAILED}; {@code false} otherwise
 */
public static boolean hasFetchStatus(CrawlDatum datum) {
  // STATUS_PARSE_FAILED is checked explicitly in addition to the range;
  // NOTE(review): presumably because 0x45 lies above STATUS_FETCH_MAX,
  // whose value is not visible here - confirm.
  return (datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX)
      || CrawlDatum.STATUS_PARSE_FAILED == datum.getStatus();
}

public CrawlDatum() {
Expand Down
12 changes: 10 additions & 2 deletions src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
}
continue;
}

switch (datum.getStatus()) { // collect other info
case CrawlDatum.STATUS_LINKED:
CrawlDatum link;
Expand Down Expand Up @@ -233,7 +233,7 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
}
break;

case CrawlDatum.STATUS_FETCH_SUCCESS: // succesful fetch
case CrawlDatum.STATUS_FETCH_SUCCESS: // successful fetch
case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected
case CrawlDatum.STATUS_FETCH_REDIR_PERM:
case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, notmodified
Expand Down Expand Up @@ -320,6 +320,14 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
}
break;

case CrawlDatum.STATUS_PARSE_FAILED: // successful fetch, but parse failed
if (oldSet)
result.setSignature(old.getSignature()); // use old signature
result.setStatus(CrawlDatum.STATUS_DB_PARSE_FAILED);
result = schedule.setPageGoneSchedule(key, result, prevFetchTime,
prevModifiedTime, fetch.getFetchTime());
break;

case CrawlDatum.STATUS_FETCH_GONE: // permanent failure
if (oldSet)
result.setSignature(old.getSignature()); // use old signature
Expand Down
8 changes: 4 additions & 4 deletions src/java/org/apache/nutch/fetcher/Fetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -577,11 +577,11 @@ public void fetch(Path segment, int threads) throws IOException,
} catch (InterruptedException | ClassNotFoundException e) {
LOG.error(StringUtils.stringifyException(e));
throw e;
} finally {
stopWatch.stop();
LOG.info("Fetcher: finished, elapsed: {} ms", stopWatch.getTime(
TimeUnit.MILLISECONDS));
}

stopWatch.stop();
LOG.info("Fetcher: finished, elapsed: {} ms", stopWatch.getTime(
TimeUnit.MILLISECONDS));
}

/**
Expand Down
18 changes: 14 additions & 4 deletions src/java/org/apache/nutch/fetcher/FetcherThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ public class FetcherThread extends Thread {
URLNormalizers normalizersForOutlinks;

private boolean skipTruncated;
private boolean deleteFailedParse;

private boolean halted = false;

Expand Down Expand Up @@ -181,6 +182,7 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
this.scfilters = new ScoringFilters(conf);
this.parseUtil = new ParseUtil(conf);
this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
this.deleteFailedParse = conf.getBoolean(ParseSegment.DELETE_FAILED_PARSE, false);
this.signatureWithoutParsing = conf.getBoolean("fetcher.signature", false);
this.protocolFactory = new ProtocolFactory(conf);
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
Expand Down Expand Up @@ -221,7 +223,7 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
.getInt("http.robots.503.defer.visits.retries", 3);
}

if((activatePublisher=conf.getBoolean("fetcher.publisher", false)))
if ((activatePublisher = conf.getBoolean("fetcher.publisher", false)))
this.publisher = new FetcherThreadPublisher(conf);

queueMode = conf.get("fetcher.queue.mode",
Expand Down Expand Up @@ -442,7 +444,7 @@ public void run() {
case ProtocolStatus.SUCCESS: // got a page
pstatus = output(fit.url, fit.datum, content, status,
CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
updateStatus(content.getContent().length);
updateStatus(content.getContent() != null ? content.getContent().length : 0);
if (pstatus != null && pstatus.isSuccess()
&& pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
String newUrl = pstatus.getMessage();
Expand Down Expand Up @@ -731,14 +733,18 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
.calculate(content, new ParseStatus().getEmptyParse(conf));
datum.setSignature(signature);
}

if (parseResult == null && parsing && deleteFailedParse) {
datum.setStatus(CrawlDatum.STATUS_PARSE_FAILED);
status = CrawlDatum.STATUS_PARSE_FAILED;
}
}

/*
* Store the status code in the content so that we can read this value
* during parsing (as a separate job) and decide whether to parse or not.
*/
content.getMetadata().add(Nutch.FETCH_STATUS_KEY,
Integer.toString(status));
content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
}

try {
Expand All @@ -756,6 +762,10 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
LOG.warn("{} {} Error parsing: {}: {}", getName(),
Thread.currentThread().getId(), key, parseStatus);
parse = parseStatus.getEmptyParse(conf);
if (deleteFailedParse && content != null) {
// forward the failure status in the content
content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(CrawlDatum.STATUS_PARSE_FAILED));
}
}

// Calculate page signature. For non-parsing fetchers this will
Expand Down
15 changes: 15 additions & 0 deletions src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
Expand Down Expand Up @@ -206,6 +207,7 @@ public static class IndexerReducer extends
private boolean delete = false;
private boolean deleteRobotsNoIndex = false;
private boolean deleteSkippedByIndexingFilter = false;
private boolean deleteFailedParse = false;
private boolean base64 = false;
private IndexingFilters filters;
private ScoringFilters scfilters;
Expand All @@ -226,6 +228,7 @@ public static class IndexerReducer extends
private Counter deletedGoneCounter;
private Counter deletedRedirectsCounter;
private Counter deletedDuplicatesCounter;
private Counter deletedFailedParseCounter;
private Counter skippedNotModifiedCounter;
private Counter deletedByIndexingFilterCounter;
private Counter skippedByIndexingFilterCounter;
Expand All @@ -244,6 +247,7 @@ public void setup(Reducer<Text, NutchWritable, Text, NutchIndexAction>.Context c
false);
deleteSkippedByIndexingFilter = conf.getBoolean(INDEXER_DELETE_SKIPPED,
false);
deleteFailedParse = conf.getBoolean(ParseSegment.DELETE_FAILED_PARSE, false);
skip = conf.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
base64 = conf.getBoolean(INDEXER_BINARY_AS_BASE64, false);

Expand Down Expand Up @@ -279,6 +283,8 @@ private void initCounters(Reducer<Text, NutchWritable, Text, NutchIndexAction>.C
NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_REDIRECTS_TOTAL);
deletedDuplicatesCounter = context.getCounter(
NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_DUPLICATES_TOTAL);
deletedFailedParseCounter = context.getCounter(
NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_FAILED_PARSE_TOTAL);
skippedNotModifiedCounter = context.getCounter(
NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_SKIPPED_NOT_MODIFIED_TOTAL);
deletedByIndexingFilterCounter = context.getCounter(
Expand Down Expand Up @@ -354,6 +360,15 @@ public void reduce(Text key, Iterable<NutchWritable> values,
}
}

// Whether to delete pages where parsing failed
if (deleteFailedParse && fetchDatum != null) {
if (fetchDatum.getStatus() == CrawlDatum.STATUS_PARSE_FAILED
|| dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_PARSE_FAILED) {
deletedFailedParseCounter.increment(1);
context.write(key, DELETE_ACTION);
return;
}
}
// Whether to delete GONE or REDIRECTS
if (delete && fetchDatum != null) {
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
Expand Down
62 changes: 31 additions & 31 deletions src/java/org/apache/nutch/metadata/Nutch.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,62 +26,62 @@
*/
public interface Nutch {

public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";

public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";
public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";

public static final String SIGNATURE_KEY = "nutch.content.digest";
public static final String SIGNATURE_KEY = "nutch.content.digest";

public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
public static final String SEGMENT_NAME_KEY = "nutch.segment.name";

public static final String SCORE_KEY = "nutch.crawl.score";
public static final String SCORE_KEY = "nutch.crawl.score";

public static final String GENERATE_TIME_KEY = "_ngt_";
public static final String GENERATE_TIME_KEY = "_ngt_";

public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
GENERATE_TIME_KEY);
public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
GENERATE_TIME_KEY);

public static final Text PROTOCOL_STATUS_CODE_KEY = new Text("nutch.protocol.code");
public static final Text PROTOCOL_STATUS_CODE_KEY = new Text("nutch.protocol.code");

public static final String PROTO_STATUS_KEY = "_pst_";
public static final String PROTO_STATUS_KEY = "_pst_";

public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
PROTO_STATUS_KEY);
public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
PROTO_STATUS_KEY);

public static final String FETCH_TIME_KEY = "_ftk_";
public static final String FETCH_TIME_KEY = "_ftk_";

public static final String FETCH_STATUS_KEY = "_fst_";
public static final String FETCH_STATUS_KEY = "_fst_";

/**
* Name to store the <a href="https://www.robotstxt.org/meta.html">robots
* metatag</a> in {@link org.apache.nutch.parse.ParseData}'s metadata.
*/
public static final String ROBOTS_METATAG = "robots";

/**
* Sites may request that search engines don't provide access to cached
* documents.
*/
public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
/**
* Sites may request that search engines don't provide access to cached
* documents.
*/
public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";

/** Show both original forbidden content and summaries (default). */
public static final String CACHING_FORBIDDEN_NONE = "none";
/** Show both original forbidden content and summaries (default). */
public static final String CACHING_FORBIDDEN_NONE = "none";

/** Don't show either original forbidden content or summaries. */
public static final String CACHING_FORBIDDEN_ALL = "all";
/** Don't show either original forbidden content or summaries. */
public static final String CACHING_FORBIDDEN_ALL = "all";

/** Don't show original forbidden content, but show summaries. */
public static final String CACHING_FORBIDDEN_CONTENT = "content";
/** Don't show original forbidden content, but show summaries. */
public static final String CACHING_FORBIDDEN_CONTENT = "content";

public static final String REPR_URL_KEY = "_repr_";
public static final String REPR_URL_KEY = "_repr_";

public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);

/** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
public static final String FIXED_INTERVAL_KEY = "fixedInterval";
/** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
public static final String FIXED_INTERVAL_KEY = "fixedInterval";

public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(
FIXED_INTERVAL_KEY);
public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(
FIXED_INTERVAL_KEY);

/** For progress of job (programmatic / tooling). */
public static final String STAT_PROGRESS = "progress";
Expand Down
3 changes: 3 additions & 0 deletions src/java/org/apache/nutch/metrics/NutchMetrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,9 @@ private NutchMetrics() {
/** Documents deleted as duplicates. */
public static final String INDEXER_DELETED_DUPLICATES_TOTAL = "deleted_duplicates_total";

/** Documents deleted because parsing failed. */
public static final String INDEXER_DELETED_FAILED_PARSE_TOTAL = "deleted_failed_parse_total";

/** Documents deleted by indexing filter. */
public static final String INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL = "deleted_by_indexing_filter_total";

Expand Down
Loading
Loading