mkdir -p ~/dev && cd ~/dev
wget https://dlcdn.apache.org/nutch/1.20/apache-nutch-1.20-bin.tar.gz
tar xvfz apache-nutch-1.20-bin.tar.gz
cd apache-nutch-1.20
Set the crawler's user agent in conf/nutch-site.xml:
<property>
  <name>http.agent.name</name>
  <value>My Nutch Spider</value>
</property>
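This property belongs inside the <configuration> element of conf/nutch-site.xml. As a minimal sketch (assuming no other overrides are needed), the whole file can be written in one step:
cat > conf/nutch-site.xml <<'EOF'
<?xml version="1.0"?>
<configuration>
  <property>
    <name>http.agent.name</name>
    <value>My Nutch Spider</value>
  </property>
</configuration>
EOF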
Check that Nutch can fetch and parse the seed site:
bin/nutch parsechecker -dumpText https://mp4.okdevtv.com
Create urls/seed.txt containing the seed URL:
https://mp4.okdevtv.com/
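If the urls directory does not exist yet, the seed file can be created from the shell:
mkdir -p urls
echo "https://mp4.okdevtv.com/" > urls/seed.txt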
bin/nutch inject crawl/crawldb urls
bin/nutch generate crawl/crawldb crawl/segments
ls -d crawl/segments/2*
s1=`ls -d crawl/segments/2* | tail -1`
echo $s1
bin/nutch fetch $s1
bin/nutch parse $s1
bin/nutch updatedb crawl/crawldb $s1
bin/nutch generate crawl/crawldb crawl/segments -topN 1000
s2=`ls -d crawl/segments/2* | tail -1`
echo $s2
bin/nutch fetch $s2
bin/nutch parse $s2
bin/nutch updatedb crawl/crawldb $s2
bin/nutch generate crawl/crawldb crawl/segments -topN 1000
s3=`ls -d crawl/segments/2* | tail -1`
echo $s3
bin/nutch fetch $s3
bin/nutch parse $s3
bin/nutch updatedb crawl/crawldb $s3
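The three rounds above differ only in the segment name, so they can also be run as a loop (a sketch; the number of rounds and -topN are adjustable, and Nutch also ships a bin/crawl script that automates the whole cycle):
for round in 1 2 3; do
  bin/nutch generate crawl/crawldb crawl/segments -topN 1000
  segment=$(ls -d crawl/segments/2* | tail -1)
  bin/nutch fetch "$segment"
  bin/nutch parse "$segment"
  bin/nutch updatedb crawl/crawldb "$segment"
done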
bin/nutch invertlinks crawl/linkdb -dir crawl/segments
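To sanity-check the crawl before indexing, dump the CrawlDb statistics:
bin/nutch readdb crawl/crawldb -stats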
cd ~/dev
wget https://dlcdn.apache.org/lucene/solr/8.11.4/solr-8.11.4.tgz
tar xvfz solr-8.11.4.tgz
cd ~/dev/solr-8.11.4
mkdir -p ./server/solr/configsets/nutch/
cp -r ./server/solr/configsets/_default/* ./server/solr/configsets/nutch/
Copy Nutch's Solr schema into the nutch configset's conf directory:
cp ../apache-nutch-1.20/plugins/indexer-solr/schema.xml ./server/solr/configsets/nutch/conf/
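Note: the _default configset uses Solr's managed schema, which takes precedence over a copied schema.xml. If the Nutch fields are not picked up, remove the managed schema from the nutch configset so schema.xml is loaded instead:
rm ./server/solr/configsets/nutch/conf/managed-schema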
bin/solr start
bin/solr create -c odevtube -d ./server/solr/configsets/nutch/conf/
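To confirm the core was created (assuming Solr runs on its default port 8983):
curl "http://localhost:8983/solr/admin/cores?action=STATUS&core=odevtube"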
cd ~/dev/apache-nutch-1.20/
Point the Solr index writer at the new core: in conf/index-writers.xml, set the url parameter of the Solr writer to http://localhost:8983/solr/odevtube (by default it points at a core named nutch). Then index all segments:
bin/nutch index crawl/crawldb/ -linkdb crawl/linkdb/ -dir crawl/segments/ -filter -normalize -deleteGone
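Once indexing finishes, a quick query should return documents from the crawl (same core name and port as above):
curl "http://localhost:8983/solr/odevtube/select?q=*:*&rows=5"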
To test how Nutch interprets a site's robots.txt (for example before adding a host to http.robot.rules.whitelist), run the RobotRulesParser tool through the nutch launcher:
bin/nutch org.apache.nutch.protocol.RobotRulesParser robots.txt url Nutch-crawler
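Here robots.txt is a local copy of the robots file, url is a plain-text file of URLs to check, and Nutch-crawler is the agent name to match. For example, using this walkthrough's site (assuming it publishes a robots.txt):
curl -s https://mp4.okdevtv.com/robots.txt -o robots.txt
echo "https://mp4.okdevtv.com/" > url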