apache · igiguere · Feb 5, 2026 · Feb 11, 2026 · Feb 15, 2026 · Feb 18, 2026
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
@@ -1503,6 +1503,14 @@
   </description>
 </property>
 
+<property>
+  <name>indexer.delete</name>
+  <value>false</value>
+  <description>Whether the indexer deletes documents of pages that are gone. Gone pages include redirects and duplicates.
+  See also: 'link.delete.gone'.
+  </description>
+</property>
+
 <property>
   <name>indexer.indexwriters.file</name>
   <value>index-writers.xml</value>
@@ -1797,6 +1805,15 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
   </description>
 </property>
 
+<property>
+  <name>parser.delete.failed.parse</name>
+  <value>false</value>
+  <description>Whether to delete a page from the index when parsing the page fails.
+  Disabled by default, because enabling it also removes pages that are already in the index
+  from a previous fetch whose content was parsed successfully.
+  </description>
+</property>
+
 <property>
   <name>parser.store.text</name>
   <value>true</value>
@@ -2507,7 +2524,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
 <property>
   <name>link.delete.gone</name>
   <value>false</value>
-  <description>Whether to delete gone pages from the web graph.</description>
+  <description>Whether to delete gone pages from the web graph. Gone pages include redirects and duplicates.</description>
 </property>
 
 <property>

diff --git a/src/java/org/apache/nutch/crawl/CrawlDatum.java b/src/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -75,6 +75,8 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
   public static final byte STATUS_DB_DUPLICATE = 0x07;
   /** Page was marked as orphan, e.g. has no inlinks anymore */
   public static final byte STATUS_DB_ORPHAN = 0x08;
+  /** Page parsing failed */
+  public static final byte STATUS_DB_PARSE_FAILED = 0x09;
 
   /** Maximum value of DB-related status. */
   public static final byte STATUS_DB_MAX = 0x1f;
@@ -103,6 +105,8 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
   public static final byte STATUS_LINKED = 0x43;
   /** Page got metadata from a parser */
   public static final byte STATUS_PARSE_META = 0x44;
+  /** Page was fetched successfully, but parsing failed */
+  public static final byte STATUS_PARSE_FAILED = 0x45;
 
   public static final HashMap<Byte, String> statNames = new HashMap<>();
   static {
@@ -114,6 +118,7 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
     statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
     statNames.put(STATUS_DB_DUPLICATE, "db_duplicate");
     statNames.put(STATUS_DB_ORPHAN, "db_orphan");
+    statNames.put(STATUS_DB_PARSE_FAILED, "db_parse_failed");
     statNames.put(STATUS_SIGNATURE, "signature");
     statNames.put(STATUS_INJECTED, "injected");
     statNames.put(STATUS_LINKED, "linked");
@@ -124,6 +129,7 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
     statNames.put(STATUS_FETCH_GONE, "fetch_gone");
     statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");
     statNames.put(STATUS_PARSE_META, "parse_metadata");
+    statNames.put(STATUS_PARSE_FAILED, "parse_failed");
 
     oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
     oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
@@ -144,16 +150,14 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
   private long modifiedTime;
   private org.apache.hadoop.io.MapWritable metaData;
 
+  /** Returns true for any CrawlDb status, i.e. at most STATUS_DB_MAX; this range already contains STATUS_DB_PARSE_FAILED (ref.: CrawlDbReducer, IndexerReducer) */
   public static boolean hasDbStatus(CrawlDatum datum) {
-    if (datum.status <= STATUS_DB_MAX)
-      return true;
-    return false;
+    return datum.status <= STATUS_DB_MAX;
   }
-
+  /** Returns true for a fetch status (above STATUS_DB_MAX, at most STATUS_FETCH_MAX) or for STATUS_PARSE_FAILED (ref.: CrawlDbReducer, IndexerReducer, SegmentMergerReducer) */
   public static boolean hasFetchStatus(CrawlDatum datum) {
-    if (datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX)
-      return true;
-    return false;
+    return (datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX)
+      || CrawlDatum.STATUS_PARSE_FAILED == datum.getStatus();
   }
 
   public CrawlDatum() {

diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
@@ -130,7 +130,7 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
         }
         continue;
       }
-
+      
       switch (datum.getStatus()) { // collect other info
       case CrawlDatum.STATUS_LINKED:
         CrawlDatum link;
@@ -233,7 +233,7 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
       }
       break;
 
-    case CrawlDatum.STATUS_FETCH_SUCCESS: // succesful fetch
+    case CrawlDatum.STATUS_FETCH_SUCCESS: // successful fetch
     case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected
     case CrawlDatum.STATUS_FETCH_REDIR_PERM:
     case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, notmodified
@@ -320,6 +320,14 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
       }
       break;
 
+    case CrawlDatum.STATUS_PARSE_FAILED: // successful fetch, but parse failed
+      if (oldSet)
+        result.setSignature(old.getSignature()); // use old signature
+      result.setStatus(CrawlDatum.STATUS_DB_PARSE_FAILED);
+      result = schedule.setPageGoneSchedule(key, result, prevFetchTime,
+          prevModifiedTime, fetch.getFetchTime());
+      break;
+
     case CrawlDatum.STATUS_FETCH_GONE: // permanent failure
       if (oldSet)
         result.setSignature(old.getSignature()); // use old signature

diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -577,11 +577,11 @@ public void fetch(Path segment, int threads) throws IOException,
     } catch (InterruptedException | ClassNotFoundException e) {
       LOG.error(StringUtils.stringifyException(e));
       throw e;
+    } finally {
+      stopWatch.stop();
+      LOG.info("Fetcher: finished, elapsed: {} ms", stopWatch.getTime(
+          TimeUnit.MILLISECONDS));
     }
-
-    stopWatch.stop();
-    LOG.info("Fetcher: finished, elapsed: {} ms", stopWatch.getTime(
-        TimeUnit.MILLISECONDS));
   }
 
   /**

diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -112,6 +112,7 @@ public class FetcherThread extends Thread {
   URLNormalizers normalizersForOutlinks;
 
   private boolean skipTruncated;
+  private boolean deleteFailedParse;
 
   private boolean halted = false;
 
@@ -181,6 +182,7 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
     this.scfilters = new ScoringFilters(conf);
     this.parseUtil = new ParseUtil(conf);
     this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
+    this.deleteFailedParse = conf.getBoolean(ParseSegment.DELETE_FAILED_PARSE, false);
     this.signatureWithoutParsing = conf.getBoolean("fetcher.signature", false);
     this.protocolFactory = new ProtocolFactory(conf);
     this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
@@ -221,7 +223,7 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
           .getInt("http.robots.503.defer.visits.retries", 3);
     }
 
-    if((activatePublisher=conf.getBoolean("fetcher.publisher", false)))
+    if ((activatePublisher = conf.getBoolean("fetcher.publisher", false)))
       this.publisher = new FetcherThreadPublisher(conf);
 
     queueMode = conf.get("fetcher.queue.mode",
@@ -442,7 +444,7 @@ public void run() {
             case ProtocolStatus.SUCCESS: // got a page
               pstatus = output(fit.url, fit.datum, content, status,
                   CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
-              updateStatus(content.getContent().length);
+              updateStatus(content.getContent() != null ? content.getContent().length : 0);
               if (pstatus != null && pstatus.isSuccess()
                   && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                 String newUrl = pstatus.getMessage();
@@ -731,14 +733,18 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
               .calculate(content, new ParseStatus().getEmptyParse(conf));
           datum.setSignature(signature);
         }
+
+        if (parseResult == null && parsing && deleteFailedParse) {
+          datum.setStatus(CrawlDatum.STATUS_PARSE_FAILED);
+          status = CrawlDatum.STATUS_PARSE_FAILED;
+        }
       }
 
       /*
        * Store status code in content So we can read this value during parsing
        * (as a separate job) and decide to parse or not.
        */
-      content.getMetadata().add(Nutch.FETCH_STATUS_KEY,
-          Integer.toString(status));
+      content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
     }
 
     try {
@@ -756,6 +762,10 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
             LOG.warn("{} {} Error parsing: {}: {}", getName(),
                 Thread.currentThread().getId(), key, parseStatus);
             parse = parseStatus.getEmptyParse(conf);
+            if (deleteFailedParse && content != null) {
+              // forward the failure status in the content; set() instead of add(), since this key was already added earlier in this method (NOTE(review): assumes Metadata.add appends rather than replaces - confirm)
+              content.getMetadata().set(Nutch.FETCH_STATUS_KEY, Integer.toString(CrawlDatum.STATUS_PARSE_FAILED));
+            }
           }
 
           // Calculate page signature. For non-parsing fetchers this will

diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -51,6 +51,7 @@
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseSegment;
 import org.apache.nutch.parse.ParseText;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.scoring.ScoringFilterException;
@@ -206,6 +207,7 @@ public static class IndexerReducer extends
     private boolean delete = false;
     private boolean deleteRobotsNoIndex = false;
     private boolean deleteSkippedByIndexingFilter = false;
+    private boolean deleteFailedParse = false;
     private boolean base64 = false;
     private IndexingFilters filters;
     private ScoringFilters scfilters;
@@ -226,6 +228,7 @@ public static class IndexerReducer extends
     private Counter deletedGoneCounter;
     private Counter deletedRedirectsCounter;
     private Counter deletedDuplicatesCounter;
+    private Counter deletedFailedParseCounter;
     private Counter skippedNotModifiedCounter;
     private Counter deletedByIndexingFilterCounter;
     private Counter skippedByIndexingFilterCounter;
@@ -244,6 +247,7 @@ public void setup(Reducer<Text, NutchWritable, Text, NutchIndexAction>.Context c
           false);
       deleteSkippedByIndexingFilter = conf.getBoolean(INDEXER_DELETE_SKIPPED,
           false);
+      deleteFailedParse = conf.getBoolean(ParseSegment.DELETE_FAILED_PARSE, false);
       skip = conf.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
       base64 = conf.getBoolean(INDEXER_BINARY_AS_BASE64, false);
 
@@ -279,6 +283,8 @@ private void initCounters(Reducer<Text, NutchWritable, Text, NutchIndexAction>.C
           NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_REDIRECTS_TOTAL);
       deletedDuplicatesCounter = context.getCounter(
           NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_DUPLICATES_TOTAL);
+      deletedFailedParseCounter = context.getCounter(
+          NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_FAILED_PARSE_TOTAL);
       skippedNotModifiedCounter = context.getCounter(
           NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_SKIPPED_NOT_MODIFIED_TOTAL);
       deletedByIndexingFilterCounter = context.getCounter(
@@ -354,6 +360,15 @@ public void reduce(Text key, Iterable<NutchWritable> values,
         }
       }
 
+      // Whether to delete pages where parsing failed
+      if (deleteFailedParse && fetchDatum != null) {
+        if (fetchDatum.getStatus() == CrawlDatum.STATUS_PARSE_FAILED
+            || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_PARSE_FAILED) {
+          deletedFailedParseCounter.increment(1);
+          context.write(key, DELETE_ACTION);
+          return;
+        }
+      }
       // Whether to delete GONE or REDIRECTS
       if (delete && fetchDatum != null) {
         if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE

diff --git a/src/java/org/apache/nutch/metadata/Nutch.java b/src/java/org/apache/nutch/metadata/Nutch.java
@@ -26,62 +26,62 @@
  */
 public interface Nutch {
 
-	public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
+  public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
 
-	public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";
+  public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";
 
-	public static final String SIGNATURE_KEY = "nutch.content.digest";
+  public static final String SIGNATURE_KEY = "nutch.content.digest";
 
-	public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
+  public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
 
-	public static final String SCORE_KEY = "nutch.crawl.score";
+  public static final String SCORE_KEY = "nutch.crawl.score";
 
-	public static final String GENERATE_TIME_KEY = "_ngt_";
+  public static final String GENERATE_TIME_KEY = "_ngt_";
 
-	public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
-			GENERATE_TIME_KEY);
+  public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
+      GENERATE_TIME_KEY);
 
-	public static final Text PROTOCOL_STATUS_CODE_KEY = new Text("nutch.protocol.code");
+  public static final Text PROTOCOL_STATUS_CODE_KEY = new Text("nutch.protocol.code");
 
-	public static final String PROTO_STATUS_KEY = "_pst_";
+  public static final String PROTO_STATUS_KEY = "_pst_";
 
-	public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
-			PROTO_STATUS_KEY);
+  public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
+      PROTO_STATUS_KEY);
 
-	public static final String FETCH_TIME_KEY = "_ftk_";
+  public static final String FETCH_TIME_KEY = "_ftk_";
 
-	public static final String FETCH_STATUS_KEY = "_fst_";
+  public static final String FETCH_STATUS_KEY = "_fst_";
 
   /**
    * Name to store the <a href="https://www.robotstxt.org/meta.html">robots
    * metatag</a> in {@link org.apache.nutch.parse.ParseData}'s metadata.
    */
   public static final String ROBOTS_METATAG = "robots";
 
-	/**
-	 * Sites may request that search engines don't provide access to cached
-	 * documents.
-	 */
-	public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
+  /**
+   * Sites may request that search engines don't provide access to cached
+   * documents.
+   */
+  public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
 
-	/** Show both original forbidden content and summaries (default). */
-	public static final String CACHING_FORBIDDEN_NONE = "none";
+  /** Show both original forbidden content and summaries (default). */
+  public static final String CACHING_FORBIDDEN_NONE = "none";
 
-	/** Don't show either original forbidden content or summaries. */
-	public static final String CACHING_FORBIDDEN_ALL = "all";
+  /** Don't show either original forbidden content or summaries. */
+  public static final String CACHING_FORBIDDEN_ALL = "all";
 
-	/** Don't show original forbidden content, but show summaries. */
-	public static final String CACHING_FORBIDDEN_CONTENT = "content";
+  /** Don't show original forbidden content, but show summaries. */
+  public static final String CACHING_FORBIDDEN_CONTENT = "content";
 
-	public static final String REPR_URL_KEY = "_repr_";
+  public static final String REPR_URL_KEY = "_repr_";
 
-	public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
+  public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
 
-	/** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
-	public static final String FIXED_INTERVAL_KEY = "fixedInterval";
+  /** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
+  public static final String FIXED_INTERVAL_KEY = "fixedInterval";
 
-	public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(
-			FIXED_INTERVAL_KEY);
+  public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(
+      FIXED_INTERVAL_KEY);
 
 	 /** For progress of job (programmatic / tooling). */
 	public static final String STAT_PROGRESS = "progress";

diff --git a/src/java/org/apache/nutch/metrics/NutchMetrics.java b/src/java/org/apache/nutch/metrics/NutchMetrics.java
@@ -188,6 +188,9 @@ private NutchMetrics() {
   /** Documents deleted as duplicates. */
   public static final String INDEXER_DELETED_DUPLICATES_TOTAL = "deleted_duplicates_total";
 
+  /** Documents deleted because parsing failed. */
+  public static final String INDEXER_DELETED_FAILED_PARSE_TOTAL = "deleted_failed_parse_total";
+
   /** Documents deleted by indexing filter. */
   public static final String INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL = "deleted_by_indexing_filter_total";