conf/nutch-site.xml
urls/seed.txt
https://mp4.okdevtv.com/
take ~/dev
wget https://dlcdn.apache.org/lucene/solr/8.11.4/solr-8.11.4.tgz
tar xvfz solr-8.11.4.tgz
conf
directorywhitelist
java -cp lib/apache-nutch-1.20.jar:runtime/local/lib/hadoop-core-1.2.0.jar:runtime/local/lib/crawler-commons-0.6.jar:runtime/local/lib/slf4j-log4j12-1.7.5.jar:runtime/local/lib/slf4j-api-1.7.9.jar:runtime/local/lib/log4j-1.2.17.jar:runtime/local/lib/guava-16.0.1.jar:runtime/local/lib/commons-logging-1.1.3.jar:runtime/local/lib/commons-cli-1.2.jar org.apache.nutch.protocol.RobotRulesParser robots.txt url Nutch-crawler