diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index ffb07a6c91..cc0e8d4388 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1606,7 +1606,10 @@
plugins
Directories where Nutch plugins are located. Each
element may be a relative or absolute path. If absolute, it is used
- as is. If relative, it is searched for on the classpath.
+ as is. If relative, it is searched for on the classpath.
+ For secure deployments, treat these directories as trusted code: use
+ read-only filesystem permissions or immutable images so untrusted
+ parties cannot add or replace plugin JARs or plugin.xml files.
@@ -2146,6 +2149,19 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
each property value is always an array of Strings (so if you expect one value, use [0])
* doc - contains all the NutchFields from the NutchDocument.
each property value is always an array of Objects.
+ Expressions are evaluated in a sandboxed JEXL engine (see also
+ nutch.jexl.disable.sandbox).
+
+
+
+
+ nutch.jexl.disable.sandbox
+ false
+ If true, disables the Commons JEXL sandbox and the restriction
+ on the JEXL "new" operator for all Nutch JEXL expressions (index-jexl-filter,
+ Generator, ReadHostDb filter, CrawlDbReader, exchange-jexl, etc.). This is
+ unsafe and should only be used in fully trusted environments when a
+ legitimate expression cannot be expressed under the default sandbox.
diff --git a/docker/README.md b/docker/README.md
index 80e1a1d6d9..720fdf8165 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -56,6 +56,12 @@ $(boot2docker shellinit | grep export) #may not be necessary
docker build -t apache/nutch . --build-arg BUILD_MODE=2 --build-arg SERVER_PORT=8081 --build-arg SERVER_HOST=0.0.0.0 --build-arg WEBAPP_PORT=8080
```
+## Security and plugin directories
+
+Nutch loads executable code from the directories configured as `plugin.folders` (see `nutch-default.xml`). For production and shared images, treat those paths as **trusted**: mount them read-only where possible, rebuild images to change plugins, and run the crawl process under a dedicated low-privilege user so the filesystem cannot be abused to drop unexpected JARs or `plugin.xml` files into that tree.
+
+User-defined JEXL in configuration (for example `index.jexl.filter`, generator expressions, and `hostdb.filter.expression`) is evaluated in a **sandboxed** engine by default. The property `nutch.jexl.disable.sandbox` disables that protection and must not be set in untrusted environments.
+
## Usage
If not already running, start docker
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 03cf0fbd39..57e684374c 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -906,7 +906,7 @@ public void setup(
retry = config.getInt("retry", -1);
if (config.get("expr", null) != null) {
- expr = JexlUtil.parseExpression(config.get("expr", null));
+ expr = JexlUtil.parseExpression(config, config.get("expr", null));
}
sample = config.getFloat("sample", 1);
}
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 102ce39b94..aa8cfcbbfa 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -227,7 +227,7 @@ public void setup(
if (!restrictStatusString.isEmpty()) {
restrictStatus = CrawlDatum.getStatusByName(restrictStatusString);
}
- expr = JexlUtil.parseExpression(conf.get(GENERATOR_EXPR, null));
+ expr = JexlUtil.parseExpression(conf, conf.get(GENERATOR_EXPR, null));
// Initialize error tracker with cached counters
errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context);
// Initialize cached counter references
@@ -453,10 +453,10 @@ public void setup(Context context) throws IOException {
URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
if (conf.get(GENERATOR_HOSTDB) != null) {
- maxCountExpr = JexlUtil
- .parseExpression(conf.get(GENERATOR_MAX_COUNT_EXPR, null));
- fetchDelayExpr = JexlUtil
- .parseExpression(conf.get(GENERATOR_FETCH_DELAY_EXPR, null));
+ maxCountExpr = JexlUtil.parseExpression(conf,
+ conf.get(GENERATOR_MAX_COUNT_EXPR, null));
+ fetchDelayExpr = JexlUtil.parseExpression(conf,
+ conf.get(GENERATOR_FETCH_DELAY_EXPR, null));
}
// Initialize error tracker with cached counters
errorTracker = new ErrorTracker(NutchMetrics.GROUP_GENERATOR, context);
@@ -871,7 +871,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
* maximum number of segments to generate
* @param expr
* a Jexl expression to use in the Generator job.
- * @see JexlUtil#parseExpression(String)
+ * @see JexlUtil#parseExpression(Configuration, String)
* @throws IOException
* if an I/O exception occurs.
* @see LockUtil#createLockFile(Configuration, Path, boolean)
@@ -922,7 +922,7 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
* @param hostdb
* name of a hostdb from which to execute Jexl expressions in a bid
* to determine the maximum URL count and/or fetch delay per host.
- * @see JexlUtil#parseExpression(String)
+ * @see JexlUtil#parseExpression(Configuration, String)
* @throws IOException
* if an I/O exception occurs.
* @see LockUtil#createLockFile(Configuration, Path, boolean)
diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
index 9f2e4a384e..23d94bc881 100644
--- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java
+++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java
@@ -44,11 +44,10 @@
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.SegmentReaderUtil;
-import org.apache.commons.jexl3.JexlBuilder;
import org.apache.commons.jexl3.JexlContext;
import org.apache.commons.jexl3.JexlScript;
-import org.apache.commons.jexl3.JexlEngine;
import org.apache.commons.jexl3.MapContext;
+import org.apache.nutch.util.JexlUtil;
/**
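* @see <a href="https://commons.apache.org/proper/commons-jexl/reference/syntax.html">Commons JEXL syntax reference</a>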
@@ -77,11 +76,7 @@ public void setup(Context context) {
fieldHeader = context.getConfiguration().getBoolean(HOSTDB_DUMP_HEADER, true);
String expr = context.getConfiguration().get(HOSTDB_FILTER_EXPRESSION);
if (expr != null) {
- // Create or retrieve a JexlEngine
- JexlEngine jexl = new JexlBuilder().silent(true).strict(true).create();
-
- // Create an expression object
- this.expr = jexl.createScript(expr);
+ this.expr = JexlUtil.parseExpression(context.getConfiguration(), expr);
}
}
diff --git a/src/java/org/apache/nutch/util/JexlUtil.java b/src/java/org/apache/nutch/util/JexlUtil.java
index 549aebc419..29e8a4f204 100644
--- a/src/java/org/apache/nutch/util/JexlUtil.java
+++ b/src/java/org/apache/nutch/util/JexlUtil.java
@@ -23,33 +23,159 @@
import org.apache.commons.jexl3.JexlBuilder;
import org.apache.commons.jexl3.JexlEngine;
+import org.apache.commons.jexl3.JexlFeatures;
import org.apache.commons.jexl3.JexlScript;
+import org.apache.commons.jexl3.introspection.JexlSandbox;
import org.apache.commons.lang3.time.DateUtils;
+import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * Utility methods for handling JEXL expressions
+ * Utility methods for handling JEXL expressions used in crawl and index
+ * pipelines. Expressions are evaluated under a {@link JexlSandbox} with
+ * {@link JexlFeatures#newInstance(boolean)} disabled so arbitrary classes cannot
+ * be instantiated from user-supplied configuration.
*/
public class JexlUtil {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
+ /**
+ * When {@code true}, JEXL parsing skips the sandbox (unsafe). For trusted
+ * environments only; not recommended.
+ */
+ public static final String DISABLE_SANDBOX_KEY = "nutch.jexl.disable.sandbox";
+
/** Supported format for date parsing yyyy-MM-ddTHH:mm:ssZ */
- private static final Pattern DATE_PATTERN = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
+ private static final Pattern DATE_PATTERN = Pattern
+ .compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
+
+ /**
+ * Classes and interfaces that may be introspected when evaluating Nutch JEXL
+ * scripts. Default-deny sandbox: anything not listed is blocked.
+ */
+ private static final String[] SANDBOX_ALLOW_CLASSES = {
+ "java.lang.String",
+ "java.lang.Boolean",
+ "java.lang.Byte",
+ "java.lang.Character",
+ "java.lang.Short",
+ "java.lang.Integer",
+ "java.lang.Long",
+ "java.lang.Float",
+ "java.lang.Double",
+ "java.lang.Number",
+ "java.lang.Math",
+ "java.lang.Comparable",
+ "java.lang.CharSequence",
+ "java.util.Map",
+ "java.util.List",
+ "java.util.Collection",
+ "java.util.Set",
+ "java.util.SortedMap",
+ "java.util.SortedSet",
+ "java.util.Iterator",
+ "java.lang.Iterable",
+ "java.util.AbstractList",
+ "java.util.AbstractCollection",
+ "java.util.AbstractMap",
+ "java.util.AbstractSet",
+ "java.util.ArrayList",
+ "java.util.LinkedList",
+ "java.util.HashMap",
+ "java.util.LinkedHashMap",
+ "java.util.HashSet",
+ "java.util.LinkedHashSet",
+ "java.util.TreeMap",
+ "java.util.TreeSet",
+ "java.util.Collections",
+ "java.util.Arrays",
+ "java.util.regex.Pattern",
+ "java.util.regex.Matcher",
+ "org.apache.commons.jexl3.MapContext",
+ "org.apache.nutch.indexer.NutchDocument",
+ "org.apache.nutch.indexer.NutchField",
+ };
+ /** Cached engines; each is assigned at most once, by getSandboxedEngine() and getLegacyEngine() respectively. */
+ private static volatile JexlEngine sandboxedEngine;
+ private static volatile JexlEngine legacyEngine;
+ /** Private constructor: the members of this class visible here are all static. */
+ private JexlUtil() {
+ }
+
+ private static JexlSandbox createSandbox() {
+ JexlSandbox sandbox = new JexlSandbox(false);
+ for (String name : SANDBOX_ALLOW_CLASSES) {
+ sandbox.allow(name);
+ }
+ return sandbox;
+ }
+
+ /**
+  * Builds the feature set for the sandboxed engine: a copy of the JEXL
+  * default features with {@code newInstance(false)} applied.
+  *
+  * @return the features object returned by {@code newInstance(false)}
+  */
+ private static JexlFeatures createFeatures() {
+   JexlFeatures restricted = new JexlFeatures(JexlFeatures.createDefault());
+   return restricted.newInstance(false);
+ }
+
+ private static JexlEngine getSandboxedEngine() {
+ if (sandboxedEngine == null) {
+ synchronized (JexlUtil.class) {
+ if (sandboxedEngine == null) {
+ sandboxedEngine = new JexlBuilder().silent(true).strict(true)
+ .sandbox(createSandbox()).features(createFeatures()).create();
+ }
+ }
+ }
+ return sandboxedEngine;
+ }
+
+ /**
+  * Returns the shared engine that has no sandbox and no feature restriction,
+  * building it on first use while holding the {@code JexlUtil.class} lock.
+  * It is configured only with {@code silent(true)} and {@code strict(true)}.
+  *
+  * @return the cached unsandboxed engine
+  */
+ private static JexlEngine getLegacyEngine() {
+   JexlEngine engine = legacyEngine;
+   if (engine != null) {
+     return engine;
+   }
+   synchronized (JexlUtil.class) {
+     engine = legacyEngine;
+     if (engine == null) {
+       engine = new JexlBuilder().silent(true).strict(true).create();
+       legacyEngine = engine;
+     }
+     return engine;
+   }
+ }
+
+ /**
+  * Selects the engine to parse with. The unsandboxed engine is used only when
+  * {@code conf} is non-null and {@link #DISABLE_SANDBOX_KEY} reads as
+  * {@code true}; a warning is logged each time that happens. In every other
+  * case, including a null {@code conf}, the sandboxed engine is used.
+  *
+  * @param conf configuration to read the flag from; may be null
+  * @return the engine to use for parsing
+  */
+ private static JexlEngine engineFor(Configuration conf) {
+   boolean sandboxDisabled = conf != null
+       && conf.getBoolean(DISABLE_SANDBOX_KEY, false);
+   if (!sandboxDisabled) {
+     return getSandboxedEngine();
+   }
+   LOG.warn("{}=true: JEXL sandbox is disabled; only use in fully trusted environments.",
+       DISABLE_SANDBOX_KEY);
+   return getLegacyEngine();
+ }
/**
- * Parses the given expression to a JEXL expression. This supports
- * date parsing.
+ * Parses a JEXL expression using the default (sandboxed) engine. Use
+ * {@link #parseExpression(Configuration, String)} when a {@link Configuration}
+ * is available so {@link #DISABLE_SANDBOX_KEY} can be honored.
*
* @param expr string JEXL expression
* @return parsed JEXL expression, or null if {@code expr} is null or cannot be parsed
*/
public static JexlScript parseExpression(String expr) {
- if (expr == null) return null;
-
+ return parseExpression(null, expr);
+ }
+
+ /**
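+ * Parses a JEXL expression. Unless {@link #DISABLE_SANDBOX_KEY} is set to
+ * {@code true} in {@code conf}, the expression is parsed for execution under
+ * a restrictive sandbox. The first date found in the form
+ * {@code yyyy-MM-ddTHH:mm:ssZ} (for example {@code 2016-03-20T00:00:00Z}) is
+ * replaced by its epoch milliseconds before parsing.
+ *
+ * @param conf Hadoop configuration, or null to always use the sandbox
+ * @param expr string JEXL expression
+ * @return parsed JEXL expression, or null if {@code expr} is null or cannot
+ * be parsed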
+ */
+ public static JexlScript parseExpression(Configuration conf, String expr) {
+ if (expr == null) {
+ return null;
+ }
+
try {
// Translate any date object into a long. Dates must be in the DATE_PATTERN
// format. For example: 2016-03-20T00:00:00Z
@@ -57,22 +183,21 @@ public static JexlScript parseExpression(String expr) {
if (matcher.find()) {
String date = matcher.group();
-
+
// parse the matched substring and get the epoch
- Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"});
+ Date parsedDate = DateUtils.parseDateStrictly(date,
+ new String[] { "yyyy-MM-dd'T'HH:mm:ss'Z'" });
long time = parsedDate.getTime();
-
+
// replace the original string date with the numeric value
expr = expr.replace(date, Long.toString(time));
}
- JexlEngine jexl = new JexlBuilder().silent(true).strict(true).create();
-
- return jexl.createScript(expr);
+ return engineFor(conf).createScript(expr);
} catch (Exception e) {
LOG.error(e.getMessage());
}
-
+
return null;
}
}
diff --git a/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java b/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java
index a55557595d..be6bf0dbe8 100644
--- a/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java
+++ b/src/plugin/exchange-jexl/src/java/org/apache/nutch/exchange/jexl/JexlExchange.java
@@ -41,7 +41,8 @@ public class JexlExchange implements Exchange {
*/
@Override
public void open(Map parameters) {
- expression = JexlUtil.parseExpression(parameters.get(EXPRESSION_KEY));
+ expression = JexlUtil.parseExpression(getConf(),
+ parameters.get(EXPRESSION_KEY));
}
/**
diff --git a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
index e1fa792951..a89be63826 100644
--- a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
+++ b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
@@ -114,7 +114,7 @@ public void setConf(Configuration conf) {
"The property index.jexl.filter must have a value when index-jexl-filter is used. You can use 'true' or 'false' to index all/none");
}
- expr = JexlUtil.parseExpression(strExpr);
+ expr = JexlUtil.parseExpression(conf, strExpr);
if (expr == null) {
LOG.error("Failed parsing JEXL from index.jexl.filter: {}", strExpr);
diff --git a/src/test/org/apache/nutch/util/TestJexlUtil.java b/src/test/org/apache/nutch/util/TestJexlUtil.java
new file mode 100644
index 0000000000..221fffea22
--- /dev/null
+++ b/src/test/org/apache/nutch/util/TestJexlUtil.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import org.apache.commons.jexl3.JexlScript;
+import org.apache.commons.jexl3.MapContext;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Unit tests for {@link JexlUtil} sandboxing.
+ */
+public class TestJexlUtil {
+ /** A property comparison on a nested MapContext parses with the default engine and evaluates to true. */
+ @Test
+ public void testSandboxAllowsDocFieldCompare() throws Exception {
+ JexlScript script = JexlUtil.parseExpression("doc.lang == 'en'");
+ assertNotNull(script);
+ MapContext doc = new MapContext();
+ doc.set("lang", "en");
+ MapContext root = new MapContext();
+ root.set("doc", doc);
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+ /** A numeric comparison against a float context variable evaluates to true. */
+ @Test
+ public void testSandboxAllowsScoreCompare() throws Exception {
+ JexlScript script = JexlUtil.parseExpression("score > 0.5");
+ assertNotNull(script);
+ MapContext root = new MapContext();
+ root.set("score", 0.9f);
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+ /** With no configuration, an expression using "new" is rejected: parseExpression returns null. */
+ @Test
+ public void testNewInstanceIoBlocked() {
+ assertNull(JexlUtil.parseExpression("new java.io.File('/')"));
+ }
+ /** Same check as above for a "new java.io.FileOutputStream(...)" expression; it is only parsed, never executed. */
+ @Test
+ public void testNewInstanceFileOutputStreamBlocked() {
+ assertNull(JexlUtil.parseExpression(
+ "new java.io.FileOutputStream('/tmp/nutch-jexl-poc')"));
+ }
+ /** With DISABLE_SANDBOX_KEY set to true, the "new" expression parses to a non-null script; it is not executed here. */
+ @Test
+ public void testDisableSandboxAllowsNewExpressionParse() {
+ Configuration conf = new Configuration();
+ conf.setBoolean(JexlUtil.DISABLE_SANDBOX_KEY, true);
+ JexlScript script = JexlUtil.parseExpression(conf,
+ "new java.io.File('/')");
+ assertNotNull(script);
+ }
+ /** Plain integer arithmetic and comparison evaluate to true against an empty context. */
+ @Test
+ public void testArithmeticAllowed() throws Exception {
+ JexlScript script = JexlUtil.parseExpression("2 * 3 + 1 == 7");
+ assertNotNull(script);
+ assertTrue(Boolean.TRUE.equals(script.execute(new MapContext())));
+ }
+ /** A String method call (startsWith) on a String context variable is permitted and evaluates to true. */
+ @Test
+ public void testStringMethodsAllowed() throws Exception {
+ JexlScript script = JexlUtil.parseExpression(
+ "url.startsWith('http://')");
+ assertNotNull(script);
+ MapContext root = new MapContext();
+ root.set("url", "http://example.org/");
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+ /** An expression containing a date literal still parses; only the non-null result is asserted. */
+ @Test
+ public void testDateRewriteStillParses() {
+ JexlScript script = JexlUtil.parseExpression(
+ "fetchTime > 2016-03-20T00:00:00Z");
+ assertNotNull(script);
+ }
+ /** A null expression yields null from both parseExpression overloads. */
+ @Test
+ public void testNullExpression() {
+ assertNull(JexlUtil.parseExpression(null));
+ assertNull(JexlUtil.parseExpression(new Configuration(), null));
+ }
+ /** A syntactically invalid expression yields null instead of throwing. */
+ @Test
+ public void testInvalidSyntaxReturnsNull() {
+ assertNull(JexlUtil.parseExpression("doc.lang=<>:='en'"));
+ }
+ /** Calling size() on an ArrayList stored in a nested MapContext is permitted and evaluates as expected. */
+ @Test
+ public void testListSize() throws Exception {
+ JexlScript script = JexlUtil.parseExpression("doc.tags.size() == 2");
+ assertNotNull(script);
+ MapContext doc = new MapContext();
+ java.util.List tags = new java.util.ArrayList<>();
+ tags.add("a");
+ tags.add("b");
+ doc.set("tags", tags);
+ MapContext root = new MapContext();
+ root.set("doc", doc);
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+ /** Two top-level context variables (an int and a float) combined with && evaluate to true. */
+ @Test
+ public void testGeneratorStyleMetadata() throws Exception {
+ JexlScript script = JexlUtil.parseExpression(
+ "warc_import_time > 0 && score > 0");
+ assertNotNull(script);
+ MapContext root = new MapContext();
+ root.set("warc_import_time", 1);
+ root.set("score", 1.0f);
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+ /** String.equalsIgnoreCase is callable on a String context variable. */
+ @Test
+ public void testEqualsIgnoreCase() throws Exception {
+ JexlScript script = JexlUtil.parseExpression(
+ "status.equalsIgnoreCase('FETCHED')");
+ assertNotNull(script);
+ MapContext root = new MapContext();
+ root.set("status", "fetched");
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+ /** The JEXL =~ operator with a regular-expression string literal evaluates to true for a matching URL. */
+ @Test
+ public void testRegex() throws Exception {
+ JexlScript script = JexlUtil.parseExpression(
+ "url =~ 'https?://.*\\.example\\.org/.*'");
+ assertNotNull(script);
+ MapContext root = new MapContext();
+ root.set("url", "http://foo.example.org/bar");
+ assertTrue(Boolean.TRUE.equals(script.execute(root)));
+ }
+ /** A ternary expression evaluates to its first branch, the integer 1. */
+ @Test
+ public void testTernary() throws Exception {
+ JexlScript script = JexlUtil.parseExpression("true ? 1 : 0");
+ assertNotNull(script);
+ assertEquals(1, script.execute(new MapContext()));
+ }
+}