java网络爬虫(代码片段)

author author     2022-12-24     271

关键词:

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

/**
 * Created by Piyush Juneja on 3/31/17.
 */
/**
 * An inverted-index entry: a word together with the postings list of
 * URL IDs of the pages it appears on.
 *
 * Created by Piyush Juneja on 3/31/17.
 */
public class Word implements Serializable {
    public static final long serialVersionUID = -3696191086353573895L;

    String word;
    ArrayList<Integer> postings; // URL IDs of pages containing this word
    int urlID; // NOTE(review): never read after construction; kept for serialization compatibility

    /** Creates an entry for {@code word} with a single posting for {@code urlID}. */
    public Word(String word, int urlID) {
        this.word = word;
        this.postings = new ArrayList<>();
        this.postings.add(urlID);
    }

    /** Records another page (by URL ID) on which this word occurs. */
    public void addURLID(int urlID) {
        postings.add(urlID);
    }

    public String getWord() {
        return word;
    }

    /** Returns the live postings list (not a defensive copy). */
    public List<Integer> getList() {
        return postings;
    }

    /**
     * Two entries are equal iff their word text matches exactly.
     * Fixed: the original cast without an instanceof check (could throw
     * ClassCastException) and overrode equals without hashCode; it also
     * appeared twice in the file — the duplicate fragment was removed.
     */
    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof Word)) {
            return false;
        }
        return this.word.equals(((Word) obj).getWord());
    }

    @Override
    public int hashCode() {
        return word.hashCode();
    }
}

import java.io.Serializable;

/**
 * Created by Sahil Pattni on 17-Apr-17.
 */
public class SearchThread implements Serializable, Runnable 
    int start;
    int finish;
    String[] terms;

    public SearchThread(int start, int finish, String[] terms) 
        this.start = start;
        this.finish = finish;
        this.terms = terms;
    

    public Word findTerm(String term) 
        for (Word word : Search.wordList) 
            if (word.getWord().equalsIgnoreCase(term)) 
                return word;
            
        
        return null;
    

    public void run() 
        for (String term : terms)  //For each term searched
            for (int i = start; i <= finish; i++) 
                if (Search.wordList.get(i).getWord().equalsIgnoreCase(term))  //if Word was found in Search.wordList  || CHANGELOG: replaced findTerm(term) != null
                    for (int ID : Search.wordList.get(i).getList())  //Parse through link postings for given words
                        String url = Search.pageList.get(ID).getURL();
                        Result result = new Result(url , ID); //Store each link as a result

                        if (Search.resultSet.contains(result))  //if array of results already contains current result
                            int index = Search.resultSet.indexOf(result);
                            Search.resultSet.get(index).incrementScore(); //Increment score of current result object
                        
                        else
                            Search.resultSet.add(result); //if result doesn't exist in result array, add to array;
                    
                
            
        
    

import java.io.File;
import java.io.Serializable;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

/**
 * Created by Sahil Pattni on 17-Apr-17.
 */
public class Search 
    static List<Page> pageList;
    private String pageListFile;
    static List<Result> resultSet;
    static List<Word> wordList;
    private String wordListFile;
    private FileUtils fl;

    public Search(String wordListFile, String pageListFile) 
        this.wordListFile = wordListFile;
        this.pageListFile = pageListFile;
        fl = new FileUtils();
        resultSet = Collections.synchronizedList(new ArrayList<Result>()); //code from project page on sync
        setup(wordListFile, pageListFile);
    

    public synchronized void setup(String wordListFile, String pageListFile) 
        try 
            pageList = fl.getPageList(pageListFile);
            wordList = fl.getWordList(wordListFile);
        
        catch (Exception e) 
            e.printStackTrace();
        
    

    public List<Result> executeQuery(String query) 
        String[] keywords = query.split(" ");
        Thread[] threads = new Thread[5];
        int totalSize = wordList.size();
        int divided = totalSize/5;
        int endPoint = totalSize/5;  //Variable endpoint for SearchThread constructor
        int startPoint = 0;         //Variable startpoint for SearchThread constructor
        int currentThread = 0;  //current thread position in array

        for (Thread thread : threads) 
            thread = new Thread(new SearchThread(startPoint, endPoint, keywords)); //Start thread
            threads[currentThread] = thread; //Assign thread to current position in array
            if (startPoint == 0)
                startPoint += divided + 1;
            else
                startPoint+=(totalSize/5);

            if (endPoint+divided < totalSize-5)
                endPoint += divided;
            else
                endPoint = totalSize -1;

            currentThread++;
        
        for (Thread thread : threads) 
            thread.start();
        
        for (Thread thread : threads) 
            try 
                thread.join();
             catch (Exception e) 
                e.printStackTrace();
            
        
        sort();

        return resultSet;
    

    public void nullCheck() 
        if (pageList == null || wordList == null)
            setup(this.wordListFile, this.pageListFile);
    
    public void sort() 
        Collections.sort(resultSet);
    

import java.io.Serializable;

/**
 * Created by Sahil Pattni on 17-Apr-17.
 */
/**
 * A single search hit: a URL plus a relevance score that is bumped each
 * time another query term matches the same page.
 *
 * Created by Sahil Pattni on 17-Apr-17.
 */
public class Result implements Serializable, Comparable<Result> {
    public static final long serialVersionUID = -938761094876384658L;

    public String url;
    public int urlID;
    public int score; // starts at 1 for the first hit

    public Result(String url, int urlID) {
        this.url = url;
        this.urlID = urlID;
        this.score = 1;
    }

    /** Adds {@code score} to the current score. */
    public void updateScore(int score) {
        this.score += score;
    }

    public void incrementScore() {
        score++;
    }

    public int getScore() {
        return score;
    }

    public String getURL() {
        return url;
    }

    public int getURLID() {
        return urlID;
    }

    /**
     * Results are equal when either the URL or the URL ID matches.
     * NOTE(review): OR-based equality is not transitive and cannot be paired
     * with a consistent hashCode — do not put Result in hash-based collections
     * (list contains/indexOf, as used by SearchThread, is fine).
     */
    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof Result)) {
            return false;
        }
        Result result = (Result) obj;
        return this.url.equals(result.url) || this.urlID == result.urlID;
    }

    /** Descending by score: higher-scored results sort first. */
    @Override
    public int compareTo(Result candidate) {
        return Integer.compare(candidate.score, this.score);
    }
}


import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import javax.print.Doc;
import java.io.IOException;
import java.io.Serializable;

/**
 * Created by SahilMPattni on 3/31/2017.
 */
public class Parser 
    public Parser() 

    public Document getDocument(String url) throws ParseException 
        Document d = null;

        //Custom Exceptions
        if (url == null)
            throw new ParseException("getDocument() failed. String url is null.");
        else if (url.equals(""))
            throw new ParseException("getDocument() failed. String url is empty.");

            //Try-Catch
        else 
            try 
                System.out.println(url);
                d = Jsoup.connect(url).timeout(3000).get();
             catch (Exception e) 
                throw new ParseException("getDocument() failed. Connection failed.");
            
            if (d == null)
                throw new ParseException("getDocument() failed. Docoument is null.");
        
        return d;
    

    public Elements getLinks(Document doc) throws ParseException 
        if (doc == null)
            throw new ParseException("getLinks() failed. Document parameter is null.");
        else
            return doc.select("a[href]");
    

    public String getBody(Document doc) throws ParseException 
        if (doc == null)
            throw new ParseException("getBody() failed. Document parameter is null.");

        Element content = doc.body();

        if (content != null)
            return content.text();
        else
            return "";
    

    public static void main(String[] args) throws ParseException 
        Parser p = new Parser();
        Document doc = p.getDocument("http://www.cs.purdue.edu");
        Element body = doc.body();
        String content = body.text();
        System.out.println(content);
        //for (Element e : links)
            //System.out.println(e.attr("abs:href"));
    

import java.io.Serializable;

/**
 * Created by SahilMPattni on 3/31/2017.
 */
/**
 * Checked exception for parsing/fetching failures in Parser and Crawler.
 *
 * Created by SahilMPattni on 3/31/2017.
 */
public class ParseException extends Exception {
    public ParseException(String message) {
        super(message);
    }

    /**
     * Backward-compatible addition: lets callers preserve the underlying
     * cause instead of swallowing it.
     */
    public ParseException(String message, Throwable cause) {
        super(message, cause);
    }
}

import java.io.Serializable;

/**
 * Created by piyushjuneja on 3/31/17.
 */
/**
 * A crawled page: its URL plus the numeric ID the crawler assigned to it.
 *
 * Created by piyushjuneja on 3/31/17.
 */
public class Page implements Comparable, Serializable {
    public static final long serialVersionUID = -1827677255104766839L;

    String url;
    private int urlID;

    public Page(String url, int urlID) {
        this.url = url;
        this.urlID = urlID;
    }

    public String getURL() {
        return this.url;
    }

    public int getURLID() {
        return this.urlID;
    }

    /**
     * Orders pages by ascending URL ID. Fixed: the original returned -1 when
     * this page's ID was the LARGER one, contradicting its own comments
     * ("Less than this") and the Comparable contract.
     */
    @Override
    public int compareTo(Object o) {
        Page candidate = (Page) o;
        return Integer.compare(this.urlID, candidate.getURLID());
    }

    /**
     * Pages are equal when either URL or URL ID matches.
     * NOTE(review): OR-equality is not transitive and has no consistent
     * hashCode — do not use Page as a hash key. Fixed: non-Page arguments now
     * return false instead of throwing ClassCastException.
     */
    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof Page)) {
            return false;
        }
        Page candidate = (Page) obj;
        return candidate.getURL().equals(this.getURL())
                || candidate.getURLID() == this.getURLID();
    }
}

import java.io.Serializable;

public class Node 

    private Object data;
    private Node next;
    private Node prev;

    public Node(Object obj) 
        this.data = obj;
    

    public void setNext(Node next) 
        this.next = next;
    

    public void setPrev(Node prev) 
        this.prev = prev;
    

    public Node getNext() 
        return this.next;
    

    public Node getPrev() 
        return this.prev;
    

    public Object getData() 
        return this.data;
    
import java.io.Serializable;

public class MyQueue 
    int count;
    Node head;
    Node tail;

    public MyQueue() 
        head = new Node(null);
        tail = new Node(null);
        count = 0;
        head.setNext(tail);
    


    public void add(Object o) 
        if(o == null)
            return;
        Node temp = new Node(o);
        if (head.getData() == null) 
            head = new Node(o);
            count++;
            head.setNext(tail);
        
        else if(head.getNext().getData() != null) 
            Node ptr = head.getNext();
            while(ptr.getNext() != null) 
                if(ptr.getNext().getData() == null) 
                    ptr.setNext(temp);
                    temp.setNext(tail);
                    count++;
                    break;
                
                ptr = ptr.getNext();
            
        
        else 
            head.setNext(temp);
            temp.setNext(tail);
            count++;
        
    

    public Node peek() 
        if(isEmpty()) return null;
        return head;
    

    public synchronized Node remove()  //CHANGELOG: added synchronized
        if(isEmpty())
            return null;
        else 
            Node toReturn = head;
            head = head.getNext();
            count--;
            //System.out.println(toReturn.getData());
            return toReturn;
            
        

    public boolean isEmpty() 
        return count == 0;
    

    public int size() 
        return count;
    

import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * Created by Sahil Pattni on 16-Apr-17.
 */
public class FileUtils 

    public boolean saveWordTable(List<Word> wordTable, String filePath)
    
        if (wordTable == null || filePath == null)
            return false;
        try
        
            FileOutputStream fos = new FileOutputStream(filePath);

            ObjectOutputStream oos = new ObjectOutputStream(fos);

            oos.writeObject(wordTable);

            oos.close();
        
        catch(Exception e)
        
            e.printStackTrace();
            return false;
        

        return true;
    

    public boolean savePageTable(List<Page> pageTable, String filePath)
    
        if (pageTable == null || filePath == null)
            return false;
        FileOutputStream fos = null;
        ObjectOutputStream oos = null;
        try
        
            fos = new FileOutputStream(filePath);

            oos = new ObjectOutputStream(fos);

            oos.writeObject(pageTable);

        
        catch(Exception e)
        
            e.printStackTrace();
            return false;
        
        finally
        
            try
            
                fos.close();
                oos.close();
             catch (IOException e) 
                e.printStackTrace();
            
        

        return true;
    

    public List<Page> getPageList(String filePath)
    
        FileInputStream fis = null;
        ObjectInputStream ois = null;
        if (filePath == null)
            return null;
        try
        
            fis = new FileInputStream(filePath);
            ois = new ObjectInputStream(fis);

            List<Page> pages = (ArrayList<Page>) ois.readObject();
            return pages;
        
        catch(Exception e)
        
            e.printStackTrace();
        
        finally
        
            try
            
                ois.close();
                fis.close();


             catch (IOException e) 
                e.printStackTrace();
            
        
        return null;
    

    public List<Word> getWordList(String filePath)
    
        FileInputStream fis = null;
        ObjectInputStream ois = null;
        if (filePath == null)
            return null;
        try
        
            fis = new FileInputStream(filePath);
            ois = new ObjectInputStream(fis);

            List<Word> words = (ArrayList<Word>) ois.readObject();
            return words;
        
        catch(Exception e)
        
            return null;
        
        /*finally
        
            try
            
                assert ois != null;
                ois.close();
                fis.close();


             catch (IOException e) 
                e.printStackTrace();
            
        */
    


import sun.swing.FilePane;

import java.io.File;
import java.util.Collections;
import java.util.List;
import java.util.Scanner;

/**
 * Created by Sahil Pattni on 17-Apr-17.
 */
public class Driver 

    private FileUtils fu;
    private List<Page> pageFile;
    private List<Word> wordFile;

    public void crawl() 
        Crawler cl = new Crawler("https://www.investing.com/", "www.investing.com", 10);
        cl.crawl();
    

    public static void main(String[] args) 
        Driver d = new Driver();
        d.crawl();
        d.save();
        
        Scanner s = new Scanner(System.in);
        boolean doneSearching = false;

        while (!doneSearching) 
            System.out.println("Enter Query");
            String query = s.nextLine();
            d.search(query);
            System.out.println();
            System.out.println("Do you want to continue (yes/no)");
            String answer = s.nextLine();
            if (answer.equalsIgnoreCase("no"))
                doneSearching = true;

        
    

    public void save() 
        fu = new FileUtils();
        fu.savePageTable(Crawler.parsed,"C:\\Users\\Sahil Pattni\\Desktop\\IntelliJ\\HW 10\\parsed.txt");
        fu.saveWordTable(Crawler.words, "C:\\Users\\Sahil Pattni\\Desktop\\IntelliJ\\HW 10\\words.txt");
    

    public void search(String query) 
        String pageLocation = "C:\\Users\\Sahil Pattni\\Desktop\\IntelliJ\\HW 10\\parsed.txt";
        String wordLocation = "C:\\Users\\Sahil Pattni\\Desktop\\IntelliJ\\HW 10\\words.txt";
        Search s = new Search(wordLocation, pageLocation);
        List<Result> results = s.executeQuery(query);
        Collections.sort(results);

        //Output
        int currentResult = 0;
        System.out.println("Query: " + query);
        for (Result result: results) 
            System.out.println("("+currentResult+")" + result.getURL() + " | " +
            "score: " + result.getScore());
        
    


import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.print.Doc;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

/**
 * Created by Sahil Pattni on 04-Apr-17.
 */
public class Crawler 
    static String seedURL;
    static int currentID;
    static String domain;
    static int limit;
    MyQueue toParse;
    static Parser parser;

    static int totalURLs;
    static List<Page> parsed  = new ArrayList<>();
    static List<String> visited = new ArrayList<>();
    static List<Word> words = new ArrayList<>();

    public Crawler(String seed, String domain, int limit) 
        currentID = 0;
        totalURLs = 0;
        Crawler.seedURL = seed;
        Crawler.domain = domain;
        Crawler.limit = limit;

        parser = new Parser();
        toParse = new MyQueue();

        toParse.add(seed);
    

    public void crawl()
        while (!toParse.isEmpty() && currentID < limit) 
            if (toParse.peek().getData() != null)  //if next node to be parsed is not null
                String link = (String) toParse.remove().getData(); //remove node from queue and parse
                if (isValidURL(link)) 
                    if (!visited.contains(link))  //if link has not been previously visited
                        Page p = new Page(link, currentID);
                        if (!pageExists(p))  //custom method to test whether page has already been created
                            try 
                                Document d = parser.getDocument(link);
                                if (parse(d, currentID)) 
                                    currentID++;
                                    addPageToList(p); //Add page to list of parsed pages
                                
                            
                            catch (ParseException e) 
                                e.printStackTrace();
                            
                        
                        visited.add(link); //Add links to visited regardless of whether they have been parsed
                    
                
            
            else 
                break;
            
        
    

    public boolean parse(Document doc, int id) 
        boolean returner = false;
        try 
            parseLinks(doc);
            parseText(doc, id);
            returner = true;
         catch (ParseException e) 
            e.printStackTrace();
        
        return returner;
    

    public boolean pageExists (Page p) 
        for (Page page : parsed) 
            if (page.equals(p))
                return true;
        

        return false;
    

    public static void main(String[] args) throws ParseException 
        Crawler cl = new Crawler("https://www.cs.purdue.edu/homes/bxd/", "www.cs.purdue.edu", 50);
        Parser p = new Parser();
        Document d = p.getDocument("http://www.cs.purdue.edu/homes/cs177");
        cl.parse(d, currentID);
    

    public void parseLinks(Document doc) throws ParseException
        Elements links = parser.getLinks(doc);
        boolean exists = false;
        for (Element element : links) 
            for (String string : visited) 
                if (element.hasText()) 
                    if (element.equals(string))
                        exists = true;
                

            
            if (!exists)
                toParse.add(element.attr("abs:href"));
        
    
    public void parseText(Document doc, int id) 
        try 
            String text = parser.getBody(doc);  //Gather text
            if (!text.equals(""))  //see getBody() in Parser class
                String[] splitUp = text.split(" "); //Split up into array

                for (String string : splitUp)   //parse through words
                    boolean isInArray = false;  // boolean to check if current word is in array
                    for (Word word : words)  // parse through Words array
                        if (word.getWord().equals(string))  // if the word exists
                            isInArray = true;
                            word.addURLID(id);
                        
                    
                    if (!isInArray) 
                        addWordToList(string.toLowerCase(), id);
                    
                
            
         catch (ParseException e) 
            e.printStackTrace();
        
    
    public void addWordToList(String word, int id) 
        Word wordle = new Word(word, id);
        words.add(wordle);
    
    public void addToQueue(String url) 
        boolean duplicateURL = false;

        for (Page page : parsed) 
            if (page.getURL().equals(url))
                duplicateURL = true;
        

        if (!duplicateURL) 
            toParse.add(url);
            totalURLs++;
        

    

    public void addPageToList(Page p) 
        boolean exists = false;

        for (Page page : parsed) 
            if (page.equals(p))
                exists = true;
        

        if (!exists)
            parsed.add(p);
    

    public boolean isInDomain(String url) 
        if (url.contains(domain))
            return true;
        return false;
    
    public boolean isValidURL(String url) 
        boolean valid = false;
        if ((url.startsWith("https://") || url.startsWith("http://"))) 
            valid = true;
        
        else
            valid = false;

        return valid;
    

java之webmagic网络爬虫(代码片段)

webmagic简介:    WebMagic是一个简单灵活的Java爬虫框架。你可以快速开发出一个高效、易维护的爬虫。    http://webmagic.io/ 准备工作:    Maven依赖(我这里用的Maven创建的web项目做测试):    <dependencies&... 查看详情

java的简单网络爬虫(爬取花瓣网的图片)(代码片段)

...简单的python爬虫,所以在学完java基础后写了一个简单的网络图片爬虫。废话不多说直接上过程代码。(爬取的图源来自花瓣网:https://huaban.com/boards/favorite/beauty/)源url页面分析拿到爬取的源url,首先是分析页面哪些东西是要爬... 查看详情

python项目实战之网络爬虫详解(代码片段)

...述二、原理三、爬虫分类1、传统爬虫2、聚焦爬虫3、通用网络爬虫(全网爬虫)四、网页抓取策略1、宽度优先搜索:2、深度优先搜索:3、最佳优先搜索:4、反向链接数策略:5、PartialPageRank策略:五... 查看详情

java爬虫下载ftp网站目录文件(代码片段)

...写在前面爬虫的本质就是自动化的去模拟正常人类发起的网络请求,然后获取网络请求所返回的数据。跟我们人手动去点击一个连接,访问一个网页获取数据,并没有什么本质的区别。下面用java的方式来爬虫ftp网站... 查看详情

java爬虫入门(代码片段)

通用网络爬虫又称全网爬虫(ScalableWebCrawler),爬行对象从一些种子URL扩充到整个Web,主要为门户站点搜索引擎和大型Web服务提供商采集数据。今天我写的主要是一些皮毛入门现在来看下我们的pom依赖 <projectxmlns="http://maven... 查看详情

爬虫入门(代码片段)

爬虫简单的说网络爬虫(Webcrawler)也叫做网络铲(Webscraper)、网络蜘蛛(Webspider),其行为一般是先“爬”到对应的网页上,再把需要的信息“铲”下来。分类网络爬虫按照系统结构和实现技术,大致可以分为以下几种类型:... 查看详情

text网络爬虫(代码片段)

查看详情

python网络爬虫(代码片段)

查看详情

java网络爬虫,就是这么的简单(代码片段)

这是Java网络爬虫系列文章的第一篇,如果你还不知道Java网络爬虫系列文章,请参看学Java网络爬虫,需要哪些基础知识。第一篇是关于Java网络爬虫入门内容,在该篇中我们以采集虎扑列表新闻的新闻标题和详情页... 查看详情

java学习网络编程全总结——tcpudp多线程io流socket简易在线咨询聊天室java爬虫(代码片段)

目录网络编程1.1、概述1.2、网络通信的要素1.3、IP1.4、端口(port)1.5、通信协议1.6、TCP文件上传Tomcat1.7、UDP发送消息构造方法摘要构造方法摘要咨询DatagramPacket.getData()与DatagramPacket.getLength()的误区trimstartsWith在线咨询:两个人都... 查看详情

xpath语法-爬虫(代码片段)

...过Xpath获取网页中我们想要的内容;为我们的后面学习Java网络爬虫基础准备工作。备注:此章节为基础核心章节,未来会在网络爬虫的数据解析环节经常使用,学会Xpath解析语法,可为未来爬虫解析省去很多麻烦。Xpath简介  ... 查看详情

网络爬虫(代码片段)

1.爬虫流程图2.简单爬虫整个网页的内容--python2importurllib2response=urllib2.urlopen("http://www.baidu.com")html=response.read()print(html) 3.中文乱码处理 #coding:utf-8importre#importrequestsimportsysimportcodecs#p 查看详情

javascript一个简单的网络爬虫(代码片段)

查看详情

java爬爬学习之webmagic(代码片段)

...爬虫的配置、启动和终止Spider爬虫配置Site爬虫分类通用网络爬虫聚焦网络爬虫增量式网络爬虫DeepWeb爬虫案例开发分析数据库表实现流程Scheduler组件三种去重方式使用和定制PipelinePipeline输出已有的Pipeline案例实现引入依赖加入配... 查看详情

网络爬虫是什么?怎么学python爬虫(代码片段)

网络爬虫又称网络蜘蛛、网络机器人,它是一种按照一定的规则自动浏览、检索网页信息的程序或者脚本。网络爬虫能够自动请求网页,并将所需要的数据抓取下来。通过对抓取的数据进行处理,从而提取出有价值的... 查看详情

网络爬虫(代码片段)

1.利用requests.get(url)获取网页页面的html文件importrequestsnewsurl=‘http://news.gzcc.cn/html/xiaoyuanxinwen/‘res=requests.get(newsurl)#返回response对象res.encoding=‘utf-8‘ 2.利用BeautifulSoup的HTML解析器,生成结构树frombs4 查看详情

ruby针对复杂网络的twitter爬虫(代码片段)

查看详情

python网络爬虫课程设计(代码片段)

...什么?(10分)为了通过爬取网站获取的信息来分析现在网络上社会、经济、技术等各种信息网站的影响力排行,以此了解人们对哪种信息网站更青睐,访问的更加频繁。二、主题式网络爬虫设计方案(10分)1.主题式网络爬虫名... 查看详情