关键词:
require 'simple_worker'
require 'eventmachine'
require 'em-http-request'
require 'nokogiri'
require 'aws'
require 'redis'
# RedEx is an EventMachine-based domain crawler that runs as a SimpleWorker
# job.  It fetches pages for one domain concurrently (em-http-request),
# stores page payloads on S3 (Happening), queues per-page processing jobs
# back into SimpleWorker, and tracks crawl state in Redis via @crawler.
# When the job approaches SimpleWorker's run-time limit it serializes its
# queues and delegates the remainder of the crawl to a follow-up job.
#
# NOTE(review): this source was recovered from a scrape that stripped every
# '{' / '}'; hash literals, block braces and #{} interpolations have been
# reconstructed below — confirm against the original repository if possible.
class RedEx < SimpleWorker::Base
  merge_gem 'em-redis'
  merge_gem 'happening'
  merge_gem 'utf8_utils'
  merge_gem 'aws'
  merge_gem 'crewait'
  unmerge '../models/domain.rb'
  unmerge '../models/project.rb'
  unmerge '../models/company.rb'
  unmerge '../models/plan.rb'
  unmerge '../models/link.rb'
  unmerge '../models/page.rb'
  unmerge '../models/page_link.rb'
  merge "../../lib/query_builder.rb"
  merge "../../lib/google_base_em.rb"
  merge "../../lib/google_em.rb"
  merge "../../lib/indexation_stats_mixin.rb"
  merge "../../lib/yahoo_stats_mixin.rb"
  merge "../../lib/bing_stats_mixin.rb"
  merge "../../lib/semrush_stats_mixin.rb"
  merge "../../lib/social_stats_mixin.rb"
  merge "../../lib/nokogiri_doc_parser_mixin.rb"
  merge "../../lib/url_validator.rb"
  merge "../../lib/url_cleaner.rb"
  merge "../../lib/crawler_helper/db.rb"
  merge "../../lib/crawler_helper/link.rb"
  merge "../../lib/cloud_crawler_helper/page_processor.rb"
  merge "../../lib/cloud_crawler_helper/found_link_adder.rb"
  merge "../../lib/cloud_crawler_helper/page_stat2.rb"
  merge "../../lib/cloud_crawler_helper/crawler_red_ex_storage_mixin.rb"
  merge "../../lib/cloud_crawler_helper/red_work_queuer.rb"
  merge "../../lib/market_fu/calculator.rb"
  merge "../../lib/cloud_link_helper/redis_mixin.rb"
  merge "../../lib/cloud_crawler_helper/red_ex_page.rb"
  merge "../models/cloud_crawler_found_link.rb"
  merge "../models/cloud_crawler.rb"
  merge "../models/cloud_domain.rb"
  merge "../models/cloud_crawler_url.rb"
  merge "../models/cloud_page.rb"
  merge "redex_page_processor.rb"
  merge '../models/clean/domain.rb'
  merge '../models/clean/project.rb'
  merge '../models/clean/company.rb'
  merge '../models/clean/plan.rb'

  # SECURITY NOTE(review): credentials are hardcoded here (and in
  # connecting_database below). They should be moved to worker payload /
  # environment configuration and rotated.
  S3_ACCESS_KEY = 'TUSA'
  S3_SECRET_KEY = 'y/XhpORF1vqrxOecHj'
  # File extensions that are never fetched (binary / non-HTML resources).
  DO_NOT_CRAWL_TYPES = %w(.pdf .doc .xls .ppt .mp3 .mp4 .m4v .avi .mpg .rss .xml .json .txt .git .zip .md5 .asc .jpg .jpeg .gif .png)
  # Maximum simultaneous HTTP fetches.
  CONCURRENT_CONNECTIONS = 50
  # Maximum simultaneous S3 writes / DB pushes.
  SIMULTANEOUS_DB_CONNECTIONS = 20
  REDIS_OPTIONS_HASH = { :host => "ikefish.redistogo.com", :port => 9065, :password => "360c4b698d", :thread_safe => true }
  VERBOSE = true

  # NOTE: duplicate :domain_id removed from the original accessor list.
  attr_accessor :domain_id, :a, :r, :visit_key, :queued_key, :starting_url, :base_uri,
    :retrieved, :error_urls, :link_queue, :db_push_queue, :s3_urls, :retries,
    :page_jobs, :link_graph, :found_link_list, :outstanding_jobs, :completed_jobs, :link_storage,
    :path_based_crawl, :crawl_limit, :is_delegated

  # Initializes per-run state: loads the CloudDomain, resets crawl flags,
  # parses the starting URL and zeroes all counters. Raises if no domain_id
  # was supplied with the job payload. Returns true.
  def setup_job
    @job_starting_time = Time.now
    SimpleRecord.establish_connection(S3_ACCESS_KEY, S3_SECRET_KEY, :s3_bucket => :new)
    raise "There is no domain_id supplied.. Aborted !!" if domain_id.blank?
    @tmp_db_push_queue = []
    @status_checking = false
    # is_delegated marks a continuation job spawned by a previous RedEx run.
    @is_delegated = is_delegated || false
    @crawl_limit = crawl_limit || 1000
    @domain_id = domain_id
    @domain = CloudDomain.find(@domain_id)
    @domain.crawl_finished = 'false'
    @domain.already_imported = 'false'
    @domain.save
    @crawler = @domain.crawler
    @starting_uri = URI.parse(@domain.crawl_url)
    @base_uri = URI.parse(clean_url(@starting_uri.to_s))
    @starting_url = @base_uri.to_s
    @retrieve_beat = Time.now
    # NOTE(review): class variables are shared across instances; presumably
    # only one RedEx instance runs per process.
    @@heartbeat = Time.now
    @@crawled_page_count = 0
    @@connections = 0
    @@db_connections = 0
    @checkstatus_connections = 0
    @retries = 0
    @s3_urls = []
    @outstanding_jobs = []
    @job_ids = []
    @aggregate_job_processing_duration = 0
    @baseurl_uri = URI.parse(@domain.crawl_url)
    @transfer_status = false
    @delegating_status = false
    @outstanding_jobs_transfer = false
    @bucket = 'domain_storage'
    # A fresh (non-delegated) crawl starts from empty Redis queues.
    if @is_delegated.eql?(false)
      log "Resetting queues..."
      reset_redis_queues
      @crawler.set_starting_time
      @crawler.set_processing_status('Initializing')
    end
    true
  end

  # Clears all crawl state previously stored in Redis for this crawler.
  def reset_redis_queues
    @crawler.flush_all_redis_information
    return true
  end

  # Configures SimpleWorker and its backing PostgreSQL database.
  # SECURITY NOTE(review): hardcoded credentials — move to configuration.
  def connecting_database
    log "Connecting to Database" if VERBOSE
    SimpleWorker.configure do |config|
      config.access_key = '6d9aefcf04552c570b239857a56a8cc3'
      config.secret_key = 'b87ef0d1d047fe457c2c6381fd1d174c'
      username = "uf7wrd8yebt4sj"
      password = "p61kv5wfk1trt0vd3w4qfper06"
      host_name = "ec2-174-129-213-125.compute-1.amazonaws.com"
      database = "dm3tjkjv0whfa7j"
      config.database = {
        :adapter => "postgresql",
        :host => host_name, # Not localhost*
        :database => database,
        :username => username,
        :password => password
      }
    end
  end

  # SimpleWorker entry point: wires up the database, initializes state and
  # runs the crawl loop. Returns false on a missing starting URL or queue
  # setup failure.
  def run
    connecting_database
    log "Setting up job..."
    setup_job
    if @starting_url.blank?
      log "Need a starting URL to crawl."
      return false
    end
    if setup_database_queues
      log "Start crawling for real domain ID : #{@domain.domain_id}"
      log "Starting at #{Time.now}"
      setup_delegated_data
      do_process
      looking_up_transfer_status
      log "Ending at #{Time.now}"
      log "Ok."
    else
      log "Error setting up database queues. Starting URL bad?"
      false
    end
  end

  # Runs the main crawl loop, then (if needed) delegates the remainder of
  # the crawl, and logs throughput / cost estimates.
  def do_process
    @starting_time = Time.now
    log "Starting Crawl at #{@starting_time}..."
    do_the_loop
    looking_up_for_delegating
    @ending_time = Time.now
    @total_seconds = @ending_time - @starting_time
    @pph = ((@@crawled_page_count / @total_seconds) * 3600.0).to_i
    log "Ending loop: Total time #{@total_seconds} seconds, total urls #{@@crawled_page_count} (#{@pph} pages/hr)"
    # $0.05 per compute-hour — presumably the SimpleWorker rate; confirm.
    cost_estimate = ((@aggregate_job_processing_duration / 1000.0) / 3600.0) * 0.05
    log "Job Time: #{@aggregate_job_processing_duration}, estimated cost $#{cost_estimate} "
  end

  # If the run hit its time limit, persists all in-memory queues back to
  # Redis/S3 and queues a follow-up RedEx job to continue the crawl.
  def looking_up_for_delegating
    log "looking_up_for_delegating"
    if @delegating_status.eql?(true)
      log "\n * Setup delegated data for next job.. "
      @crawler.flush_visit_key
      @crawler.flush_skipped_url
      @crawler.flush_error_url
      @crawler.flush_retries
      @crawler.flush_queued_url
      @crawler.flush_todo
      @crawler.flush_write_out_key
      @crawler.flush_db_push_queue_from_s3
      @crawler.flush_db_push_queue_key_list
      @crawler.set_crawled_count(@@crawled_page_count)
      @todo.size.times.each { |f| @crawler.add_todo(@todo.pop) }
      @visit_key.size.times.each { |f| @crawler.add_visit_key(@visit_key.pop) }
      @skipped_urls.size.times.each { |f| @crawler.add_skipped_url(@skipped_urls.pop) }
      @queued_key.each { |url| @crawler.add_queued_url(url) }
      @error_urls.size.times.each { |f| @crawler.add_error_url(@error_urls.pop) }
      # EM::Queue#pop is async, so drain the push queue inside a reactor.
      EM.run do
        @db_push_queue.size.times.each do
          # @db_push_queue.pop { |x| @crawler.add_db_push_queue(x.to_json) }
          @db_push_queue.pop { |x| @crawler.add_db_push_queue_to_s3(x) }
        end
        EM.stop
      end
      redex = @crawler.redex_crawler
      redex.crawl_limit = @crawl_limit
      redex.is_delegated = true
      job_id = redex.queue(:recursive => true)
      @crawler.red_ex_job_id = job_id["task_id"]
      @crawler.save
      log "\n * New Job : #{job_id['task_id']}"
      log "\n * Delegating the process to Job ID : #{@crawler.red_ex_job_id}"
    end
  end

  # For a delegated (continuation) job, restores queues and counters that
  # the previous job persisted to Redis/S3.
  def setup_delegated_data
    if @is_delegated.eql?(true)
      log "setup_delegated_data"
      log "\n\t * Get delegated data for last job..."
      @crawler.get_visit_key.each { |url| @visit_key << url }
      @crawler.get_skipped_urls.each { |url| @skipped_urls << url }
      @crawler.get_queued_urls.each { |url| @queued_key << url }
      @crawler.get_todo.each { |url| @todo << url }
      @crawler.get_error_urls.each { |url| @error_urls << url }
      @crawler.get_oustanding_jobs.each { |job_id| @outstanding_jobs << { :job_id => job_id, :redis_status => true } }
      @@crawled_page_count = @crawler.get_crawled_count.to_i
      @retries = @crawler.get_retries_count.to_i
      @crawler.get_db_push_key_list.each do |key|
        db_push_queue = @crawler.get_db_push_queue_from_s3(key)
        @tmp_db_push_queue.push(db_push_queue) if db_push_queue.is_a? Hash
        puts @tmp_db_push_queue.size.to_s + "-" + db_push_queue[:url]
      end
    end
  end

  # Pushes any outstanding page-processing job ids into Redis that have not
  # been recorded yet, then refreshes status counters. Jobs that time out
  # keep :redis_status => false and are retried on the next call.
  def update_outstanding_jobs
    jobs = @outstanding_jobs.select { |job| job[:redis_status].eql?(false) }
    jobs.each do |job|
      # @redis.sadd(@crawler.outstanding_jobs_redis_key, job[:job_id])
      begin
        @crawler.add_oustanding_jobs(job[:job_id])
        @crawler.increment_total_jobs
        job[:redis_status] = true
      rescue Timeout::Error => e
        job[:redis_status] = false
        next
      end
    end
    @crawler.set_processing_status('Processing')
    @crawler.set_crawled_count(@@crawled_page_count)
    @crawler.set_retries_count(@retries)
  end

  # On a finished crawl, marks the domain as done and stamps the real
  # (Clean::Domain) record with last/next crawl dates.
  def looking_up_transfer_status
    if @transfer_status.eql?(true)
      @domain.crawl_finished = "true"
      @domain.save
      @real_domain = Clean::Domain.find @domain.domain_id.to_i
      @real_domain.last_crawl_date = Time.now
      @real_domain.next_crawl_date = @real_domain.project.company.crawl_frequency_range
      @real_domain.next_page_severity_update = Time.now + 1.day rescue nil
      @real_domain.save(:validate => false)
      @crawler.set_finished_time
      @crawler.set_processing_status('Finished')
      log "Domain ID :#{@real_domain.id}"
      log "Last Crawl Date : " + @real_domain.last_crawl_date.to_s
      log "Next Crawl Date : " + @real_domain.next_crawl_date.to_s
    end
  end

  # Moves page hashes restored from S3 (delegated jobs) into the live
  # EM::Queue so the crawl loop can write them out.
  def pushing_db_push_queue_into_em
    log "pushing_db_push_queue_into_em"
    @tmp_db_push_queue.each do |queue|
      @db_push_queue.push(queue)
    end
  end

  # The main EventMachine reactor loop. Periodic timers watch the heartbeat,
  # the 45-minute delegation deadline, outstanding-job syncing, DB writes
  # and retrieval kick-off; the loop stops when the crawl completes.
  def do_the_loop
    @crawler.set_processing_status('Processing')
    EM.run do
      @redis = EM::Protocols::Redis.connect REDIS_OPTIONS_HASH
      @db_push_queue ||= EM::Queue.new
      pushing_db_push_queue_into_em if @is_delegated.eql?(true)
      @@heartbeat = Time.now
      log "\nBeginning RedEx Crawler Processing Loop..."
      EM.add_periodic_timer(60) do
        # Watchdog: no progress for >60s means the crawl has stalled.
        if (Time.now - @@heartbeat) > 60
          log "Exiting: Heartbeat Not Detected for more than 60 seconds."
          update_outstanding_jobs
          marking_crawler_as_done
          EM.stop
        end
        # 45-minute delegation deadline (SimpleWorker jobs are time-limited).
        if (Time.now - @job_starting_time) > 2700
          log "\t Hit 45 minutes.. Delegating data.. "
          @delegating_status = true
          # Force connection counters above their limits to halt new work.
          @@connections = 51
          @@db_connections = 21
          update_outstanding_jobs
          EM.stop
        end
      end
      EM.add_periodic_timer(60) do
        if @outstanding_jobs_transfer.eql?(false)
          @outstanding_jobs_transfer = true
          # Redis sync can block, so do it off the reactor thread.
          EM.defer(proc {
            update_outstanding_jobs
          }, proc {
            @outstanding_jobs_transfer = false
            log "Outstanding jobs on Redis updated.."
          })
        end
      end
      EM.add_periodic_timer(5) do
        update_logs_with_current_status
        if @@db_connections.to_i < SIMULTANEOUS_DB_CONNECTIONS.to_i and @delegating_status.eql?(false)
          @db_push_queue.pop { |x| write_to_db(x) rescue nil } unless @db_push_queue.blank?
        else
          log "\n\n\n** Redex Thinks that there are either too many simultaneous DB connections or the Delegating Status == false"
          log "DB Connections: #{@@db_connections}, Delegating Status: #{@delegating_status}\n\n\n"
        end
        if completed_retrieval?
          # log "\n* Completed Retrieval and Page Processing.."
          log "\n* Completed Retrieval.."
          log "\n* Stoping EM.."
          update_outstanding_jobs
          marking_crawler_as_done
          EM.stop
        end
      end
      EM.add_periodic_timer(1) do
        # Restart retrieval if the fetch pipeline has been idle for >5s.
        if (Time.now - @retrieve_beat) > 5
          unless @todo.empty? or @@connections > CONCURRENT_CONNECTIONS or @@crawled_page_count > @crawl_limit or @delegating_status.eql?(true)
            retrieve(@todo.pop) if @db_push_queue.size <= 500
          end
        end
      end
      # Initial kick-off for the first URL.
      retrieve(@todo.pop) unless @todo.blank?
    end
  end

  # Creates the in-memory work queues; seeds them with the starting URL for
  # a fresh crawl. Returns true on success, false on any error.
  def setup_database_queues
    begin
      @queued_key = []
      @todo = Queue.new
      @visit_key = Queue.new
      @skipped_urls = Queue.new
      @error_urls = Queue.new
      if @is_delegated.eql?(false)
        @todo << @starting_url
        @queued_key << @starting_url
      end
      @retrieved = []
      true
    rescue
      false
    end
  end

  # Normalizes a URL: drops the fragment and defaults an empty path to "/".
  # Returns the cleaned URL string, or false when the URL cannot be parsed.
  def clean_url(found_url)
    begin
      a = URI.parse(found_url)
      a.fragment = nil
      a.path = "/" if a.path.blank?
      return a.to_s
    rescue => e
      log "Error with #{found_url} : #{e.inspect}"
      return false
    end
  end

  # True when the URI uses http or https.
  def valid_scheme?(uri)
    ["http", "https"].include?(uri.scheme)
  end

  # Fetches one URL asynchronously; on success parses the page (RedExPage),
  # enqueues newly discovered same-host in-path links onto @todo, and pushes
  # the page hash onto the DB write queue. 503s are retried; hard failures
  # land in @error_urls.
  def retrieve(url)
    begin
      @@heartbeat = Time.now
      req = EventMachine::HttpRequest.new(url).get :head => { "Accept" => "text/html", "Accept-Encoding" => "UTF-8" }
      @visit_key << url
      @@crawled_page_count += 1
      @@connections += 1
      req.callback do
        @@connections -= 1
        @@heartbeat = Time.now
        page = RedExPage.new(:url => url, :base_uri => @base_uri, :headers => req.response_header, :code => req.response_header.status, :content => req.response)
        page.callback do |page_hash|
          if [200].include?(page_hash[:code])
            page_hash[:links].each do |link|
              @@heartbeat = Time.now
              setup_new_retrieval
              uri = strip_off_fragment(link) rescue next
              next unless valid_scheme?(uri)
              uri = to_absolute(uri)
              if same_host?(uri) and in_path?(uri)
                unless @queued_key.include?(uri.to_s)
                  link = UrlValidator.new(uri.to_s)
                  filetype = link.filetype.blank? ? '' : link.filetype.downcase
                  if DO_NOT_CRAWL_TYPES.include?(".#{filetype}")
                    @skipped_urls << uri.to_s
                    next
                  end
                  unless @queued_key.length > @crawl_limit
                    @todo.push(uri.to_s)
                    @queued_key << uri.to_s
                  end
                end
              end
            end # page_hash_each
          elsif [301,302,404].include?(page_hash[:code])
          elsif [503].include?(page_hash[:code])
            @retries += 1
            @todo.push(url)
          else
            log "[RedEx] Code type #{page_hash[:code]} not supported."
          end
          if [200,301,302,404,500].include?(page_hash[:code])
            @db_push_queue.push(page_hash)
          end
        end
      end
      req.errback do
        @@heartbeat = Time.now
        @@connections -= 1
        setup_new_retrieval
        if [301,302,404,500].include?(req.response_header.status)
          page = RedExPage.new(:url => url, :base_uri => @base_uri, :headers => req.response_header, :code => req.response_header.status, :content => req.response)
          page.callback do |page_hash|
            @db_push_queue.push(page_hash)
          end
        elsif [503].include?(req.response_header.status)
          @retries += 1
          @todo.push(url)
        else
          @error_urls << url
        end
      end
    rescue => e
      if @@connections.eql?(0)
        log "Parsing error, stopping. URL: #{url}"
        EM.stop
      else
        log "[Error On Retrieve] => #{e.inspect}"
      end
    end
  end

  # Stops the reactor once there is no more work and no open connections.
  def check_done
    if @todo.empty? and @@connections == 0
      EM.stop
    end
  end

  # Resolves a relative URI against the crawl's base URI.
  def to_absolute(uri)
    uri.relative? ? @base_uri.merge(uri) : uri
  end

  # True when the URI is on the same host as the crawl base.
  def same_host?(uri)
    @base_uri.host.eql?(uri.host)
  end

  # True when the URI's path starts with the crawl base path.
  def in_path?(uri)
    uri.path.index(@base_uri.path).eql?(0)
  end

  # Drains @s3_urls by queueing a SimpleWorker page-processing job for each
  # stored page, throttled by SIMULTANEOUS_DB_CONNECTIONS, inside its own
  # reactor. Stops after 50 minutes to allow delegation.
  def do_queuer_loop
    log "do_queuer_loop"
    log "\n\t * Starting SW queuer.."
    @@db_connections = 0
    EM.run do
      EM.add_periodic_timer(60) do
        if (Time.now - @job_starting_time) > 3000
          log "\t Hit 50 minutes.. Delegating data.. "
          @delegating_status = true
          EM.stop
        end
      end
      EM.add_periodic_timer(1) do
        if !@s3_urls.empty?
          available_db_connections = SIMULTANEOUS_DB_CONNECTIONS - @@db_connections
          # NOTE(review): this picks the LARGER of the two values, which can
          # exceed available_db_connections — looks like it was meant to be
          # the minimum; confirm before changing.
          new_connections = if @s3_urls.size > available_db_connections
            @s3_urls.size
          else
            available_db_connections
          end
          EM::Iterator.new(0..new_connections).each do |num, iter|
            s3_url = @s3_urls.pop
            queue_into_sw!(s3_url) unless s3_url.blank?
            iter.next
          end
        else
          EM.stop
        end
      end
      EM.add_periodic_timer(15) do
        log "S3 URLS : #{@s3_urls.size}, DB Connections : #{@@db_connections}"
        if @s3_urls.empty? and @@db_connections.eql?(0)
          log '* Completed SW queuer..'
          EM.stop
        end
      end
    end
  end

  # Queues a single SimpleWorker page-processing job (RedWorkQueuer) for a
  # stored S3 page, off the reactor thread, and records its task id in
  # @outstanding_jobs for later Redis sync.
  def queue_into_sw!(s3_url)
    @@heartbeat = Time.now
    EM.defer(proc {
      wq = RedWorkQueuer.new(@crawler.id, s3_url)
      wq.callback do |obinfo|
        @@heartbeat = Time.now
        if obinfo["task_id"].blank?
          log "Error::Queueing into SW::Task ID is blank: #{obinfo['task_id']}"
        else
          @outstanding_jobs << { :job_id => obinfo["task_id"], :redis_status => false }
          @job_ids << obinfo["task_id"]
        end
        ret = { :task_id => obinfo["task_id"], :s3_url => s3_url, :ob_info => obinfo }
        ret
      end
      wq.errback do
        log "Error::Queuing into SW failed S3URL: #{s3_url}"
      end
    }, proc { |hash_values|
      # log "Queued: #{hash_values.inspect}"
    })
  end

  # Serializes a page hash (Marshal) and writes it to S3 via Happening,
  # then queues a SimpleWorker job for it. Failures are logged and skipped
  # so one bad page cannot stall the crawl. Finally chains another pop from
  # the push queue if a DB slot is free.
  def write_to_db(page_hash)
    log "Write_To_DB starting" if VERBOSE
    @@heartbeat = Time.now
    @@db_connections += 1
    begin
      # MD5 suffix keeps S3 keys unique per URL.
      pagedigest = Digest::MD5.hexdigest(page_hash[:url])
      url = page_hash[:url] + "_#{pagedigest}"
      begin
        marshal_dump = Marshal.dump(page_hash)
      rescue => e
        @@db_connections -= 1
        log "Error to dump object for URL : #{page_hash[:url]}.. Skip.."
        return true
      end
      on_error = Proc.new do |http|
        @@heartbeat = Time.now
        log "WriteToDb::HappeningWrite::Error::#{http.response_header.status}"
        @error_urls << page_hash[:url]
        @@db_connections -= 1
      end
      s3_url = storage_url(url)
      item = Happening::S3::Item.new(@bucket, s3_url, :aws_access_key_id => S3_ACCESS_KEY, :aws_secret_access_key => S3_SECRET_KEY)
      item.put(marshal_dump, :on_error => on_error) do |resp|
        log "Put #{s3_url} with Happening"
        @@db_connections -= 1
        queue_into_sw!(s3_url) unless s3_url.blank?
      end
    rescue => e
      if e.inspect.include?('Happening::Error')
        @@db_connections -= 1
        log "Error to store with Happening S3 for URL : #{page_hash[:url]}.. Skip.."
        return true
      else
        puts e.inspect
        puts e.backtrace.join("\n") if e.backtrace
      end
    end
    if @@db_connections.to_i < SIMULTANEOUS_DB_CONNECTIONS.to_i and @delegating_status.eql?(false)
      @db_push_queue.pop { |x| write_to_db(x) rescue nil }
    end
  end

  # True once all fetching and writing has drained: either the crawl limit
  # was reached or the todo queue is empty, with no open connections and an
  # empty push queue.
  def completed_retrieval?
    if (@@crawled_page_count > @crawl_limit) and @@connections.eql?(0) and @db_push_queue.size.eql?(0) and @@db_connections.eql?(0)
      true
    elsif @todo.empty? and @@connections.eql?(0) and @db_push_queue.size.eql?(0) and @@db_connections.eql?(0)
      true
    else
      false
    end
  end

  # True once every queued page-processing job has completed.
  def completed_page_processing?
    if @outstanding_jobs.size > 0
      false
    elsif @outstanding_jobs.size == 0 && @completed_jobs.size > 0
      true
    else
      log "Falling through condition on RedEx on completed page processing. error check"
      false
    end
  end

  # Flags the crawl as finished so looking_up_transfer_status finalizes it.
  def marking_crawler_as_done
    # log "Setting Domain and initiating transfer of the new page data."
    log "Setting Domain."
    @transfer_status = true
  end

  # Returns a URI for the given URL with any #fragment removed and an empty
  # path defaulted to "/". Raises on unparseable input (callers rescue).
  def strip_off_fragment(url)
    uri = URI.parse(url)
    unless uri.fragment.blank?
      non_fragment = uri.to_s.gsub("##{uri.fragment}", '')
      uri = URI.parse(non_fragment)
    end
    uri.path = "/" if uri.path.blank?
    return uri
  end

  # Pops and fetches the next todo URL when under the connection, crawl and
  # backlog limits, refreshing the retrieval heartbeat.
  def setup_new_retrieval
    unless @todo.empty? or (@@connections > CONCURRENT_CONNECTIONS) or (@@crawled_page_count > @crawl_limit) or @delegating_status.eql?(true)
      if @db_push_queue.size <= 500
        @retrieve_beat = Time.now
        retrieve(@todo.pop)
      end
    end
  end

  # S3 key prefix for this crawl: "<host>@-@<year>-<month>-<day>/".
  def base_key
    current_time = @starting_time
    day = current_time.day
    month = current_time.month
    year = current_time.year
    base_url = @baseurl_uri.to_s.gsub("http://", "")
    base_key = "#{base_url}@-@#{year}-#{month}-#{day}/"
    return base_key
  end

  # Full S3 key for a page: base_key plus the CGI-escaped page URL.
  def storage_url(url)
    "#{base_key}#{CGI.escape(url)}"
  end

  # Logs a snapshot of the crawl's counters and queue sizes.
  def update_logs_with_current_status
    log "\n\n\n#{Time.now} - Running Time: #{Time.now - @starting_time} seconds\n"
    log "-- # to write to DB: #{@db_push_queue.size}, DB Connections : #{@@db_connections}, Outstanding jobs: #{@outstanding_jobs.size}"
    log "-- Crawled Count: #{@@crawled_page_count}, Visited: #{@visit_key.size}, Touched: #{@queued_key.length}, Todo: #{@todo.length}, Connections: #{@@connections}, Retries: #{@retries}, Error: #{@error_urls.size}\n\n\n"
  end
end
ruby使用mechanize的hacky爬虫(代码片段)
ruby轻量级并行web图形爬虫(代码片段)
csscss声明字体模板基于em的基础(代码片段)
基于golang的爬虫实战(代码片段)
基于golang的爬虫实战前言爬虫本来是python的强项,前期研究过scrapy,也写过一些简单的爬虫小程序,但是后来突然对golang产生兴趣,决定写写爬虫练练手。由于本人golang萌新,有错误之处,欢迎指正。大致思路由于现在动态页面... 查看详情
json基于夜视仪的网络爬虫样本。(代码片段)
javascript基于phantomjs的简单网络爬虫库(代码片段)
基于scrapy的b站爬虫(代码片段)
基于Scrapy的B站爬虫最近又被叫去做爬虫了,不得不拾起两年前搞的东西。说起来那时也是突发奇想,想到做一个B站的爬虫,然后用的都是最基本的Python的各种库。不过确实,实现起来还是有点麻烦的,单纯一个下载,就有很多... 查看详情
知乎爬虫(基于selenium)(代码片段)
今天写一下关于知乎的爬虫。利用selenium实现爬去数据.思路:打开网页选择登录界面-------->选择二维码登录------>点击“发现”------>在输入框中输入要查询的内容,回车--------->把滚动条下拉到最下面------------->获取所... 查看详情
小说爬虫(基于requests+beautifulsoup)(代码片段)
最近老是写selenium的爬虫,想复习下requests+BeautifulSoup爬取网站内容。先写一下思路:打开网站,获取网站的首页显示的小说-------------->根据输入的内容来进行判断是否含有该小说,有,就对该小说进行访问。------------->打开... 查看详情
爬虫-requests模块(代码片段)
...明轮子症、啃文档症、抑郁、头疼、甚至死亡。今日概要基于requests的get请求基于requests模块的post请求基于requests模块ajax的get请求基于requests模块ajax的post请求综合项目练 查看详情
python爬虫实战-基于代理池的高并发爬虫(代码片段)
最近在写一个基于代理池的高并发爬虫,目标是用单机从某网站API爬取十亿级别的JSON数据。代理池有两种方式能够实现爬虫对代理池的充分利用:搭建一个TunnelProxy服务器维护代理池在爬虫项目内部自动切换代理所谓TunnelProxy实... 查看详情
基于selenium的bilibili登录爬虫,解决汉字验证的问题(代码片段)
b站近日把登录页面的验证方式从滑块验证改为了汉字验证,我看网上也没用相关的爬虫教程,所以自己写了一个,作为b站爬虫参考。 fromseleniumimportwebdriverfromselenium.webdriver.common.action_chainsimportActionChainsfromselenium.webdriver.support... 查看详情
基于web的爬虫系统设计与实现(代码片段)
全套资料下载地址:https://download.csdn.net/download/sheziqiong/85585280全套资料下载地址:https://download.csdn.net/download/sheziqiong/85585280目录1绪论11.1选题背景及意义11.1.1选题背景11.1.2目的及意义11.2国内外发展现状21.2.1爬虫技术概述 查看详情
12306火车票查询爬虫(基于selenium)(代码片段)
今天写一下12306火车票查询的爬虫,新手一个,代码方面可能不是那么整洁,望海涵。。。一。这个火车票爬虫感觉还是有点难度的,一些小细节需要考虑。二。还是先讲一下思路: 获得火车票查询URL----->单击‘单程’----... 查看详情
基于scrapy-redis两种形式的分布式爬虫(代码片段)
目录基于scrapy-redis两种形式的分布式爬虫基于scrapy-redis两种形式的分布式爬虫1.scrapy框架是否可以自己实现分布式? -不可以。原因有二。 其一:因为多台机器上部署的scrapy会各自拥有各自的调度器,这样就... 查看详情
python爬虫编程思想(106):基于splash的爬虫--异步处理与go函数(代码片段)
Splash支持异步处理,例如,go函数就是通过异步方式访问页面的,不过go函数并不能指定异步回调函数,所以在调用go函数后,需要使用wait函数等待一会,这样可以给页面装载留有一定的时间... 查看详情
ruby基于poltergeist(phantomjs)的webcrawlerhelper类。使用capybara作为构建webcrawler的框架非常方便(代码片段)
python爬虫小练习:基于xpath的表格信息爬取(代码片段)
文章目录确定目标和分析思路目标思路观察情况爬取名单表爬取详情页二级信息爬虫请遵守相关法律法规,不要做违法犯罪的事情爬虫小技巧总结这是一个Python爬虫的入门练习,我们通过Request请求数据,并通过XPath去... 查看详情