diff --git a/.github/workflows/junit-report.yml b/.github/workflows/junit-report.yml index a1f5ff6497..5e9bb20ca8 100644 --- a/.github/workflows/junit-report.yml +++ b/.github/workflows/junit-report.yml @@ -18,6 +18,11 @@ on: workflow_run: workflows: [master pull request ci] types: [completed] + +concurrency: + group: junit-report-${{ github.event.workflow_run.pull_requests[0].number || github.event.workflow_run.head_sha }} + cancel-in-progress: true + permissions: checks: write contents: read diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index 31b06236a4..a4419b8937 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -21,6 +21,10 @@ on: types: [opened, synchronize, reopened] branches: [master] +concurrency: + group: master-build-${{ github.ref }} + cancel-in-progress: true + # Java Version Strategy: # - BUILD: Requires Java 17+ (JUnit 6 dependency) # - RUNTIME: Supports Java 11+ (javac.version=11 produces Java 11 bytecode) @@ -75,7 +79,7 @@ jobs: restore-keys: | ${{ runner.os }}-ivy- - name: Run Apache Rat - run: ant clean run-rat -buildfile build.xml + run: ant clean releaseaudit -buildfile build.xml - name: Cache unknown licenses run: echo "UNKNOWN_LICENSES=$(sed -n 18p /home/runner/work/nutch/nutch/build/apache-rat-report.txt)" >> $GITHUB_ENV - name: Versions @@ -140,7 +144,7 @@ jobs: # Java 11 = major version 55, Java 17 = major version 61 EXPECTED_VERSION=${{ matrix.javac-version == '11' && '55' || '61' }} echo "Expected major version: $EXPECTED_VERSION (Java ${{ matrix.javac-version }})" - + # Find a real class file (exclude package-info.class which may have different version) cd build/classes CLASS_FILE=$(find . -name "*.class" ! 
-name "package-info.class" | head -1) diff --git a/.github/workflows/sonarcloud.yml b/.github/workflows/sonarcloud.yml index 92c609483b..894a78fe49 100644 --- a/.github/workflows/sonarcloud.yml +++ b/.github/workflows/sonarcloud.yml @@ -18,6 +18,11 @@ on: workflow_run: workflows: [master pull request ci] types: [completed] + +concurrency: + group: sonarcloud-${{ github.event.workflow_run.pull_requests[0].number || github.event.workflow_run.head_sha }} + cancel-in-progress: true + jobs: analysis: if: github.event.workflow_run.conclusion == 'success' diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml new file mode 100644 index 0000000000..8db586a959 --- /dev/null +++ b/.github/workflows/yetus.yml @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Apache Yetus test-patch: pre-commit patch testing (Ant, JDK 17). +# Runs alongside master-build.yml; all CI is unified on Java 17. 
+# See https://yetus.apache.org/documentation/0.15.1/precommit/
+---
+name: Apache Yetus
+on:
+  push:
+    branches: [master]
+  pull_request:
+    types: [opened, synchronize, reopened]
+    branches: [master]
+
+concurrency:
+  group: yetus-${{ github.ref }}
+  cancel-in-progress: true
+
+# GITHUB_TOKEN cannot comment on PRs from forks (403). Use a PAT secret
+# (e.g. YETUS_COMMENT_TOKEN) if you need comments on fork PRs.
+permissions:
+  contents: read
+  statuses: write
+  pull-requests: write
+  issues: write
+
+jobs:
+  yetus:
+    runs-on: ubuntu-latest
+    timeout-minutes: 45
+    env:
+      PATCH_DIR: ${{ github.workspace }}/out
+      JAVA_HOME: /usr/lib/jvm/java-17-openjdk-amd64
+    steps:
+      - uses: actions/checkout@v5
+        with:
+          fetch-depth: 0
+      - name: Cache Ivy dependencies
+        uses: actions/cache@v4
+        with:
+          path: ~/.ivy2/cache
+          key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }}
+          restore-keys: |
+            ${{ runner.os }}-ivy-
+      - name: Apache Yetus test-patch
+        uses: apache/yetus-test-patch-action@0.15.1
+        with:
+          basedir: .
+          buildtool: nobuild
+          githubtoken: ${{ secrets.GITHUB_TOKEN }}
+          javahome: '/usr/lib/jvm/java-17-openjdk-amd64'
+          patchdir: /github/workspace/out
+          plugins: all,-jira,-gitlab,-unit,-compile
+          project: nutch
+      - name: Artifact output
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: apacheyetuspatchdir
+          path: ${{ env.PATCH_DIR }}
+      - name: Install pandoc
+        if: ${{ !cancelled() && github.event_name == 'pull_request' }}  # a bare `if:` implies success(); report failed runs too
+        run: sudo apt-get update && sudo apt-get install -y pandoc
+      - name: Convert HTML report to Markdown
+        if: ${{ !cancelled() && github.event_name == 'pull_request' }}  # run even when test-patch failed
+        run: |
+          OUT="${{ env.PATCH_DIR }}"
+          echo "## Apache Yetus test-patch report" > yetus-report.md
+          echo "" >> yetus-report.md
+          if [ -f "$OUT/report.html" ]; then
+            pandoc "$OUT/report.html" -f html -t gfm >> yetus-report.md 2>/dev/null || {
+              echo "Pandoc conversion failed; using brief report." >> yetus-report.md
+              echo '```' >> yetus-report.md
+              cat "$OUT/brief.txt" >> yetus-report.md 2>/dev/null || true
+              echo '```' >> yetus-report.md
+            }
+          elif [ -f "$OUT/brief.txt" ]; then
+            echo '```' >> yetus-report.md
+            cat "$OUT/brief.txt" >> yetus-report.md
+            echo '```' >> yetus-report.md
+          else
+            echo "No Yetus report or brief found." >> yetus-report.md
+          fi
+      - name: Truncate if over comment limit
+        if: ${{ !cancelled() && github.event_name == 'pull_request' }}  # run even when test-patch failed
+        run: |
+          MAX=60000
+          if [ $(wc -c < yetus-report.md) -gt $MAX ]; then
+            head -c $MAX yetus-report.md > yetus-report-trimmed.md
+            echo "" >> yetus-report-trimmed.md
+            echo "" >> yetus-report-trimmed.md
+            echo "_Report truncated (GitHub comment limit). Full HTML in apacheyetuspatchdir artifact as report.html._" >> yetus-report-trimmed.md
+            mv yetus-report-trimmed.md yetus-report.md
+          fi
+      - name: Comment PR with Yetus report
+        if: ${{ !cancelled() && github.event_name == 'pull_request' }}  # post the report for failing patches, not for superseded runs
+        uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9
+        continue-on-error: true
+        with:
+          token: ${{ secrets.YETUS_COMMENT_TOKEN || secrets.GITHUB_TOKEN }}
+          repository: ${{ github.repository }}
+          issue-number: ${{ github.event.pull_request.number }}
+          body-path: yetus-report.md
diff --git a/.markdownlint.yaml b/.markdownlint.yaml
new file mode 100644
index 0000000000..a35fb44e39
--- /dev/null
+++ b/.markdownlint.yaml
@@ -0,0 +1,15 @@
+# Relaxations for Yetus markdownlint (README and docs)
+---
+MD001: false
+MD003: false
+MD012: false
+MD013:
+  line_length: 200
+MD022: false
+MD025: false
+MD033: false
+MD034: false
+MD045: false
+# Rule options must be a mapping; a bare string value is not read as `style`.
+MD046:
+  style: fenced
diff --git a/.yamllint.yml b/.yamllint.yml
new file mode 100644
index 0000000000..0e2b09745d
--- /dev/null
+++ b/.yamllint.yml
@@ -0,0 +1,10 @@
+# Relax rules for workflow and config YAML (Yetus yamllint plugin)
+---
+extends: default
+
+rules:
+  line-length:
+    max: 200
+  document-start: disable
+  truthy:
+    allowed-values: ['true', 'false', 'yes', 'no', 'on', 'off']
diff --git
a/.yetus/blanks-eol.txt b/.yetus/blanks-eol.txt
new file mode 100644
index 0000000000..2362619874
--- /dev/null
+++ b/.yetus/blanks-eol.txt
@@ -0,0 +1,3 @@
+# Ignore trailing blanks in Yetus-generated patch/diff and logs (not source files).
+# See --blanks-eol-ignore-file in the blanks plugin.
+^out/
diff --git a/.yetus/personality.sh b/.yetus/personality.sh
new file mode 100644
index 0000000000..d9da6ab782
--- /dev/null
+++ b/.yetus/personality.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Ensure JAVA_HOME is set for pre-patch and other phases when running in
+# the Yetus Docker container (avoids "JAVA_HOME is not defined" in pre-patch).
+if [ -z "${JAVA_HOME}" ] && [ -d "/usr/lib/jvm/java-17-openjdk-amd64" ]; then
+  export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64"
+fi
+
+# Pass JAVA_HOME into the re-exec Docker container so pre-patch and other
+# phases see it (YETUS-913; otherwise the inner container may not get it).
+## @audience private
+## @stability stable
+function docker_do_env_adds
+{
+  declare k
+  # Use JAVA_HOME so detsecrets does not flag the literal path as high-entropy.
+  # Only forward it when non-empty so the container never receives JAVA_HOME="".
+  if [[ -n "${JAVA_HOME}" ]]; then
+    DOCKER_EXTRAARGS+=("--env=JAVA_HOME=${JAVA_HOME}")
+  fi
+  # Forward every other requested variable by name; ${!k} reads the value of
+  # the variable whose name is stored in k.
+  for k in "${DOCKER_EXTRAENVS[@]}"; do
+    [[ -z "${k}" ]] && continue
+    # JAVA_HOME was handled above; do not add it twice.
+    if [[ "JAVA_HOME" != "${k}" ]]; then
+      DOCKER_EXTRAARGS+=("--env=${k}=${!k}")
+    fi
+  done
+}
diff --git a/README.md b/README.md index fa68816042..13699fb98b 100644 --- a/README.md +++ b/README.md @@ -1,86 +1,124 @@ Apache Nutch README =================== -[![master pull request ci](https://github.com/apache/nutch/actions/workflows/master-build.yml/badge.svg)](https://github.com/apache/nutch/actions/workflows/master-build.yml) -[![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=apache_nutch&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=apache_nutch) +[![master pull request ci][ci-badge]][ci-link] +[![Quality Gate Status][sonar-badge]][sonar-link] - +[ci-badge]: https://github.com/apache/nutch/actions/workflows/master-build.yml/badge.svg +[ci-link]: https://github.com/apache/nutch/actions/workflows/master-build.yml +[sonar-badge]: https://sonarcloud.io/api/project_badges/measure?project=apache_nutch&metric=alert_status +[sonar-link]: https://sonarcloud.io/summary/new_code?id=apache_nutch + +![Nutch logo][logo] + +[logo]: https://nutch.apache.org/assets/img/nutch_logo_tm.png For the latest information about Nutch, please visit our website at: - https://nutch.apache.org/ +<https://nutch.apache.org/> and our wiki, at: - https://cwiki.apache.org/confluence/display/NUTCH/Home +<https://cwiki.apache.org/confluence/display/NUTCH/Home> To get started using Nutch read Tutorial: - https://cwiki.apache.org/confluence/display/NUTCH/NutchTutorial +<https://cwiki.apache.org/confluence/display/NUTCH/NutchTutorial> Contributing -============ +------------ + To contribute a patch, follow these instructions (note that installing [Hub](https://hub.github.com/) is not strictly required, but is recommended). 0. Download and install hub.github.com -1.
File JIRA issue for your fix at https://issues.apache.org/jira/projects/NUTCH/issues +1. File JIRA issue for your fix at + <https://issues.apache.org/jira/projects/NUTCH/issues> - you will get issue id NUTCH-xxxx where xxxx is the issue ID. 2. `git clone https://github.com/apache/nutch.git` 3. `cd nutch` 4. `git checkout -b NUTCH-xxxx` 5. edit files (please try and include a test case if possible) 6. `git status` (make sure it shows what files you expected to edit) -7. Make sure that your code complies with the [Nutch codeformatting template](https://raw.githubusercontent.com/apache/nutch/master/eclipse-codeformat.xml), which is basially two space indents +7. Make sure that your code complies with the [Nutch codeformatting + template][eclipse-format], which is basically two space indents 8. `git add ` 9. `git commit -m "fix for NUTCH-xxx contributed by "` -10. `hub fork` (if hub is not installed, you can fork the project using the "fork" button on the [Nutch Github project page](https://github.com/apache/nutch)) +10. `hub fork` (if hub is not installed, fork using the "fork" button on the + [Nutch Github project page](https://github.com/apache/nutch)) 11. `git push -u NUTCH-xxxx` -12. `hub pull-request` (if hub is not installed, please follow the instructions how to [create a pull-request from a fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork)) +12. `hub pull-request` (if hub is not installed, please follow the + instructions to [create a pull-request from a fork][pr-from-fork]) +Pre-commit / Apache Yetus +------------------------- + +Pull requests run [Apache Yetus](https://yetus.apache.org/) test-patch for +automated checks (style, reporting). See +[Basic Precommit](https://yetus.apache.org/documentation/0.15.1/precommit/) +and [Usage Introduction][yetus-usage]. CI uses Java 17. To run test-patch +locally (e.g.
before opening a PR): + +```bash +test-patch --basedir=/path/to/clean/repo --build-tool=nobuild \ + --plugins=all,-jira,-gitlab,-unit,-compile [patchfile] +``` + +Exclude patterns can be added in `.yetus/excludes.txt` (regex, one per line). IDE setup -========= +--------- ### Eclipse Generate Eclipse project files -``` +```bash ant eclipse ``` -and follow the instructions in [Importing existing projects](https://help.eclipse.org/2019-06/topic/org.eclipse.platform.doc.user/tasks/tasks-importproject.htm). - -You must [configure the nutch-site.xml](https://cwiki.apache.org/confluence/display/NUTCH/RunNutchInEclipse) before running. Make sure, you've added ```http.agent.name``` and ```plugin.folders``` properties. The plugin.folders normally points to ```/build/plugins```. +and follow the instructions in [Importing existing projects][eclipse-import]. -Now create a Java Application Configuration, choose org.apache.nutch.crawl.Injector, add two paths as arguments. First one is the crawldb directory, second one is the URL directory where, the injector can read urls. Now run your configuration. +You must [configure the nutch-site.xml][runnutch] before running. Make sure you +have added `http.agent.name` and `plugin.folders` properties. The +plugin.folders normally points to `/build/plugins`. -If we still see the ```No plugins found on paths of property plugin.folders="plugins"```, update the plugin.folders in the nutch-default.xml, this is a quick fix, but should not be used. +Now create a Java Application Configuration, choose +org.apache.nutch.crawl.Injector, add two paths as arguments: first the crawldb +directory, second the URL directory where the injector can read urls. Then run +your configuration. +If we still see "No plugins found on paths of property plugin.folders=plugins", +update the plugin.folders in the nutch-default.xml; this is a quick fix, but +should not be used. 
### Intellij IDEA -First install the [IvyIDEA Plugin](https://plugins.jetbrains.com/plugin/3612-ivyidea). then run ```ant eclipse```. This will create the necessary -.classpath and .project files so that Intellij can import the project in the next step. +First install the [IvyIDEA Plugin][ivyidea]. Then run `ant eclipse`. This +creates the .classpath and .project files so Intellij can import the project. -In Intellij IDEA, select File > New > Project from Existing Sources. Select the nutch home directory and click "Open". +In Intellij IDEA, select File > New > Project from Existing Sources. Select the +nutch home directory and click "Open". -On the "Import Project" screen select the "Import project from external model" radio button and select "Eclipse". -Click "Create". On the next screen the "Eclipse projects directory" should be already set to the nutch folder. -Leave the "Create module files near .classpath files" radio button selected. -Click "Next" on the next screens. On the project SDK screen select Java 11 and click "Create". -**N.B.** For anyone on a Mac with a homebrew-installed openjdk, you need to use the directory under _libexec_: `/libexec/openjdk.jdk/Contents/Home`. +On the "Import Project" screen select the "Import project from external model" +radio button and select "Eclipse". Click "Create". On the next screen the +"Eclipse projects directory" should be already set to the nutch folder. Leave +the "Create module files near .classpath files" radio button selected. -Once the project is imported, you will see a popup saying "Ant build scripts found", "Frameworks detected - IvyIDEA Framework detected". Click "Import". -If you don't get the pop-up, I'd suggest going through the steps again as this happens from time to time. There is another -Ant popup that asks you to configure the project. Do NOT click "Configure". +Click "Next" on the next screens. On the project SDK screen select Java 11 and +click "Create". 
**N.B.** On Mac with homebrew openjdk, use the directory under +_libexec_: `/libexec/openjdk.jdk/Contents/Home`. -To import the code-style, Go to Intellij IDEA > Preferences > Editor > Code Style > Java. +Once the project is imported, you will see a popup saying "Ant build scripts +found", "Frameworks detected - IvyIDEA Framework detected". Click "Import". If +you don't get the pop-up, go through the steps again as this happens from time +to time. There is another Ant popup that asks you to configure the project. Do +NOT click "Configure". -For the Scheme dropdown select "Project". Click the gear icon and select "Import Scheme" > "Eclipse XML file". - -Select the eclipse-format.xml file and click "Open". On next screen check the "Current Scheme" checkbox and hit OK. +To import the code-style: Intellij IDEA > Preferences > Editor > Code Style > +Java. For the Scheme dropdown select "Project". Click the gear icon and select +"Import Scheme" > "Eclipse XML file". Select the eclipse-format.xml file and +click "Open". On the next screen check the "Current Scheme" checkbox and hit OK. ### Running in Intellij IDEA @@ -88,10 +126,24 @@ Running in Intellij - Open Run/Debug Configurations - Select "+" to create a new configuration and select "Application" -- For "Main Class" enter a class with a main function (e.g. org.apache.nutch.indexer.IndexingJob). -- For "Program Arguments" add the arguments needed for the class. You can get these by running the crawl executable for your job. Use full-qualified paths. (e.g. /Users/kamil/workspace/external/nutch/crawl/crawldb /Users/kamil/workspace/external/nutch/crawl/segments/20221222160141 -deleteGone) -- For "Working Directory" enter "/Users/kamil/workspace/external/nutch/runtime/local". -- Select "Modify options" > "Modify Classpath" and add the config directory belonging to the "Working Directory" from the previous step (e.g. /Users/kamil/workspace/external/nutch/runtime/local/conf). 
This will allow the resource loader to load that configuration. -- Select "Modify options" > "Add VM Options". Add the VM options needed. You can get these by running the crawl executable for your job (e.g. -Xmx4096m -Dhadoop.log.dir=/Users/kamil/workspace/external/nutch/runtime/local/logs -Dhadoop.log.file=hadoop.log -Dmapreduce.job.reduces=2 -Dmapreduce.reduce.speculative=false -Dmapreduce.map.speculative=false -Dmapreduce.map.output.compress=true) - -**Note**: You will need to manually trigger a build through ANT to get latest updated changes when running. This is because the ant build system is separate from the Intellij one. +- For "Main Class" enter a class with a main function + (e.g. org.apache.nutch.indexer.IndexingJob) +- For "Program Arguments" add the arguments needed for the class. You can get + these by running the crawl executable for your job. Use fully-qualified paths + (e.g. crawldb and segments paths plus -deleteGone) +- For "Working Directory" enter your nutch runtime/local path +- Select "Modify options" > "Modify Classpath" and add the config directory for + that Working Directory (e.g. runtime/local/conf) +- Select "Modify options" > "Add VM Options" and add the VM options from + running the crawl executable (e.g. -Xmx4096m -Dhadoop.log.dir=... etc.) + +**Note**: You will need to manually trigger a build through ANT to get the latest +changes when running, because the ant build system is separate from +the Intellij one.
+ +[eclipse-import]: https://help.eclipse.org/2019-06/topic/org.eclipse.platform.doc.user/tasks/tasks-importproject.htm +[eclipse-format]: https://raw.githubusercontent.com/apache/nutch/master/eclipse-codeformat.xml +[pr-from-fork]: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork +[runnutch]: https://cwiki.apache.org/confluence/display/NUTCH/RunNutchInEclipse +[ivyidea]: https://plugins.jetbrains.com/plugin/3612-ivyidea +[yetus-usage]: https://yetus.apache.org/documentation/0.15.1/precommit/usage-intro/ diff --git a/build.xml b/build.xml index 08ee22f853..ea73583706 100644 --- a/build.xml +++ b/build.xml @@ -57,15 +57,15 @@ @@ -1004,8 +1004,8 @@ - +