关键词:
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
/**
 * An inverted-index entry: one term together with the list of URL ids
 * (postings) of the pages the term appears on.
 *
 * FIX(review): the original source contained a second, duplicated copy of
 * addURLID/getWord/getList/equals, which cannot compile; the duplicates are
 * removed here. equals() also blind-cast its argument (ClassCastException on
 * any non-Word) and lacked a matching hashCode().
 *
 * Created by Piyush Juneja on 3/31/17.
 */
public class Word implements Serializable {
    public static final long serialVersionUID = -3696191086353573895L;

    String word;                 // the indexed term
    ArrayList<Integer> postings; // URL ids of pages containing this word
    int urlID;                   // NOTE(review): never assigned; kept for serialization compatibility

    /**
     * Creates an entry for {@code word}, seeded with its first posting.
     *
     * @param word  the term being indexed
     * @param urlID id of the page the word was first found on
     */
    public Word(String word, int urlID) {
        postings = new ArrayList<>();
        this.word = word;
        this.postings.add(0, urlID);
    }

    /** Records another page (by id) on which this word occurs. */
    public void addURLID(int urlID) {
        postings.add(urlID);
    }

    /** @return the indexed term */
    public String getWord() {
        return word;
    }

    /** @return the posting list (live reference, not a copy) */
    public List<Integer> getList() {
        return postings;
    }

    /**
     * Two entries are equal when they index the same term; postings are
     * ignored. Non-Word arguments now return false instead of throwing.
     */
    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof Word)) {
            return false;
        }
        Word w = (Word) obj;
        return this.word.equals(w.getWord());
    }

    /** Consistent with equals(): hash on the term only. */
    @Override
    public int hashCode() {
        return word.hashCode();
    }
}
import java.io.Serializable;
/**
* Created by Sahil Pattni on 17-Apr-17.
*/
public class SearchThread implements Serializable, Runnable
int start;
int finish;
String[] terms;
public SearchThread(int start, int finish, String[] terms)
this.start = start;
this.finish = finish;
this.terms = terms;
public Word findTerm(String term)
for (Word word : Search.wordList)
if (word.getWord().equalsIgnoreCase(term))
return word;
return null;
public void run()
for (String term : terms) //For each term searched
for (int i = start; i <= finish; i++)
if (Search.wordList.get(i).getWord().equalsIgnoreCase(term)) //if Word was found in Search.wordList || CHANGELOG: replaced findTerm(term) != null
for (int ID : Search.wordList.get(i).getList()) //Parse through link postings for given words
String url = Search.pageList.get(ID).getURL();
Result result = new Result(url , ID); //Store each link as a result
if (Search.resultSet.contains(result)) //if array of results already contains current result
int index = Search.resultSet.indexOf(result);
Search.resultSet.get(index).incrementScore(); //Increment score of current result object
else
Search.resultSet.add(result); //if result doesn't exist in result array, add to array;
import java.io.File;
import java.io.Serializable;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
/**
* Created by Sahil Pattni on 17-Apr-17.
*/
public class Search
static List<Page> pageList;
private String pageListFile;
static List<Result> resultSet;
static List<Word> wordList;
private String wordListFile;
private FileUtils fl;
public Search(String wordListFile, String pageListFile)
this.wordListFile = wordListFile;
this.pageListFile = pageListFile;
fl = new FileUtils();
resultSet = Collections.synchronizedList(new ArrayList<Result>()); //code from project page on sync
setup(wordListFile, pageListFile);
public synchronized void setup(String wordListFile, String pageListFile)
try
pageList = fl.getPageList(pageListFile);
wordList = fl.getWordList(wordListFile);
catch (Exception e)
e.printStackTrace();
public List<Result> executeQuery(String query)
String[] keywords = query.split(" ");
Thread[] threads = new Thread[5];
int totalSize = wordList.size();
int divided = totalSize/5;
int endPoint = totalSize/5; //Variable endpoint for SearchThread constructor
int startPoint = 0; //Variable startpoint for SearchThread constructor
int currentThread = 0; //current thread position in array
for (Thread thread : threads)
thread = new Thread(new SearchThread(startPoint, endPoint, keywords)); //Start thread
threads[currentThread] = thread; //Assign thread to current position in array
if (startPoint == 0)
startPoint += divided + 1;
else
startPoint+=(totalSize/5);
if (endPoint+divided < totalSize-5)
endPoint += divided;
else
endPoint = totalSize -1;
currentThread++;
for (Thread thread : threads)
thread.start();
for (Thread thread : threads)
try
thread.join();
catch (Exception e)
e.printStackTrace();
sort();
return resultSet;
public void nullCheck()
if (pageList == null || wordList == null)
setup(this.wordListFile, this.pageListFile);
public void sort()
Collections.sort(resultSet);
import java.io.Serializable;
/**
* Created by Sahil Pattni on 17-Apr-17.
*/
public class Result implements Serializable, Comparable<Result>
public int score;
public static final long serialVersionUID = -938761094876384658L;
public String url;
public int urlID;
public Result(String url, int urlID)
this.url = url;
this.urlID = urlID;
score = 1;
public void updateScore(int score) this.score += score; //TODO: FIX: PARAM IS ALWAYS 783
public void incrementScore() score++;
public int getScore() return score;
public String getURL() return url;
public int getURLID() return urlID;
@Override
public boolean equals(Object obj)
if (obj instanceof Result)
Result result = (Result) obj;
if (this.url.equals(result.url) || this.urlID == result.urlID)
return true;
return false;
public int compareTo(Result candidate)
if (this.score > candidate.score)
return -1;
else if (this.score < candidate.score)
return 1;
else
return 0;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.print.Doc;
import java.io.IOException;
import java.io.Serializable;
/**
* Created by SahilMPattni on 3/31/2017.
*/
public class Parser
public Parser()
public Document getDocument(String url) throws ParseException
Document d = null;
//Custom Exceptions
if (url == null)
throw new ParseException("getDocument() failed. String url is null.");
else if (url.equals(""))
throw new ParseException("getDocument() failed. String url is empty.");
//Try-Catch
else
try
System.out.println(url);
d = Jsoup.connect(url).timeout(3000).get();
catch (Exception e)
throw new ParseException("getDocument() failed. Connection failed.");
if (d == null)
throw new ParseException("getDocument() failed. Docoument is null.");
return d;
public Elements getLinks(Document doc) throws ParseException
if (doc == null)
throw new ParseException("getLinks() failed. Document parameter is null.");
else
return doc.select("a[href]");
public String getBody(Document doc) throws ParseException
if (doc == null)
throw new ParseException("getBody() failed. Document parameter is null.");
Element content = doc.body();
if (content != null)
return content.text();
else
return "";
public static void main(String[] args) throws ParseException
Parser p = new Parser();
Document doc = p.getDocument("http://www.cs.purdue.edu");
Element body = doc.body();
String content = body.text();
System.out.println(content);
//for (Element e : links)
//System.out.println(e.attr("abs:href"));
import java.io.Serializable;
/**
 * Thrown by Parser when a document cannot be fetched or is malformed.
 *
 * Created by SahilMPattni on 3/31/2017.
 */
public class ParseException extends Exception {
    // serialVersionUID added for consistency with the project's other
    // Serializable classes (Exception is Serializable).
    public static final long serialVersionUID = 1L;

    public ParseException(String message) {
        super(message);
    }

    /**
     * Overload that preserves the underlying cause (e.g. the IOException
     * from a failed Jsoup connection) instead of discarding it.
     */
    public ParseException(String message, Throwable cause) {
        super(message, cause);
    }
}
import java.io.Serializable;
/**
 * A crawled page: its URL and the numeric id assigned during crawling.
 *
 * Created by piyushjuneja on 3/31/17.
 */
public class Page implements Comparable, Serializable {
    public static final long serialVersionUID = -1827677255104766839L;

    String url;
    private int urlID;

    public Page(String url, int urlID) {
        this.url = url;
        this.urlID = urlID;
    }

    public String getURL() {
        return this.url;
    }

    public int getURLID() {
        return this.urlID;
    }

    /**
     * Orders pages by urlID in DESCENDING order (a larger id sorts first).
     * NOTE(review): this is the reverse of the usual compareTo convention —
     * confirm before relying on sorted order elsewhere.
     */
    @Override
    public int compareTo(Object o) {
        Page candidate = (Page) o;
        return Integer.compare(candidate.getURLID(), this.getURLID());
    }

    /**
     * Pages are equal when EITHER the url OR the urlID matches (OR-equality
     * used for de-duplication; not transitive, so unsuitable as a hash key).
     * FIX: an instanceof guard makes non-Page arguments return false instead
     * of throwing ClassCastException.
     */
    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof Page)) {
            return false;
        }
        Page candidate = (Page) obj;
        return candidate.getURL().equals(this.getURL())
                || candidate.getURLID() == this.getURLID();
    }
}
import java.io.Serializable;
public class Node
private Object data;
private Node next;
private Node prev;
public Node(Object obj)
this.data = obj;
public void setNext(Node next)
this.next = next;
public void setPrev(Node prev)
this.prev = prev;
public Node getNext()
return this.next;
public Node getPrev()
return this.prev;
public Object getData()
return this.data;
import java.io.Serializable;
public class MyQueue
int count;
Node head;
Node tail;
public MyQueue()
head = new Node(null);
tail = new Node(null);
count = 0;
head.setNext(tail);
public void add(Object o)
if(o == null)
return;
Node temp = new Node(o);
if (head.getData() == null)
head = new Node(o);
count++;
head.setNext(tail);
else if(head.getNext().getData() != null)
Node ptr = head.getNext();
while(ptr.getNext() != null)
if(ptr.getNext().getData() == null)
ptr.setNext(temp);
temp.setNext(tail);
count++;
break;
ptr = ptr.getNext();
else
head.setNext(temp);
temp.setNext(tail);
count++;
public Node peek()
if(isEmpty()) return null;
return head;
public synchronized Node remove() //CHANGELOG: added synchronized
if(isEmpty())
return null;
else
Node toReturn = head;
head = head.getNext();
count--;
//System.out.println(toReturn.getData());
return toReturn;
public boolean isEmpty()
return count == 0;
public int size()
return count;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
* Created by Sahil Pattni on 16-Apr-17.
*/
public class FileUtils
public boolean saveWordTable(List<Word> wordTable, String filePath)
if (wordTable == null || filePath == null)
return false;
try
FileOutputStream fos = new FileOutputStream(filePath);
ObjectOutputStream oos = new ObjectOutputStream(fos);
oos.writeObject(wordTable);
oos.close();
catch(Exception e)
e.printStackTrace();
return false;
return true;
public boolean savePageTable(List<Page> pageTable, String filePath)
if (pageTable == null || filePath == null)
return false;
FileOutputStream fos = null;
ObjectOutputStream oos = null;
try
fos = new FileOutputStream(filePath);
oos = new ObjectOutputStream(fos);
oos.writeObject(pageTable);
catch(Exception e)
e.printStackTrace();
return false;
finally
try
fos.close();
oos.close();
catch (IOException e)
e.printStackTrace();
return true;
public List<Page> getPageList(String filePath)
FileInputStream fis = null;
ObjectInputStream ois = null;
if (filePath == null)
return null;
try
fis = new FileInputStream(filePath);
ois = new ObjectInputStream(fis);
List<Page> pages = (ArrayList<Page>) ois.readObject();
return pages;
catch(Exception e)
e.printStackTrace();
finally
try
ois.close();
fis.close();
catch (IOException e)
e.printStackTrace();
return null;
public List<Word> getWordList(String filePath)
FileInputStream fis = null;
ObjectInputStream ois = null;
if (filePath == null)
return null;
try
fis = new FileInputStream(filePath);
ois = new ObjectInputStream(fis);
List<Word> words = (ArrayList<Word>) ois.readObject();
return words;
catch(Exception e)
return null;
/*finally
try
assert ois != null;
ois.close();
fis.close();
catch (IOException e)
e.printStackTrace();
*/
import sun.swing.FilePane;
import java.io.File;
import java.util.Collections;
import java.util.List;
import java.util.Scanner;
/**
* Created by Sahil Pattni on 17-Apr-17.
*/
public class Driver
private FileUtils fu;
private List<Page> pageFile;
private List<Word> wordFile;
public void crawl()
Crawler cl = new Crawler("https://www.investing.com/", "www.investing.com", 10);
cl.crawl();
public static void main(String[] args)
Driver d = new Driver();
d.crawl();
d.save();
Scanner s = new Scanner(System.in);
boolean doneSearching = false;
while (!doneSearching)
System.out.println("Enter Query");
String query = s.nextLine();
d.search(query);
System.out.println();
System.out.println("Do you want to continue (yes/no)");
String answer = s.nextLine();
if (answer.equalsIgnoreCase("no"))
doneSearching = true;
public void save()
fu = new FileUtils();
fu.savePageTable(Crawler.parsed,"C:\\Users\\Sahil Pattni\\Desktop\\IntelliJ\\HW 10\\parsed.txt");
fu.saveWordTable(Crawler.words, "C:\\Users\\Sahil Pattni\\Desktop\\IntelliJ\\HW 10\\words.txt");
public void search(String query)
String pageLocation = "C:\\Users\\Sahil Pattni\\Desktop\\IntelliJ\\HW 10\\parsed.txt";
String wordLocation = "C:\\Users\\Sahil Pattni\\Desktop\\IntelliJ\\HW 10\\words.txt";
Search s = new Search(wordLocation, pageLocation);
List<Result> results = s.executeQuery(query);
Collections.sort(results);
//Output
int currentResult = 0;
System.out.println("Query: " + query);
for (Result result: results)
System.out.println("("+currentResult+")" + result.getURL() + " | " +
"score: " + result.getScore());
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.print.Doc;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
/**
* Created by Sahil Pattni on 04-Apr-17.
*/
public class Crawler
static String seedURL;
static int currentID;
static String domain;
static int limit;
MyQueue toParse;
static Parser parser;
static int totalURLs;
static List<Page> parsed = new ArrayList<>();
static List<String> visited = new ArrayList<>();
static List<Word> words = new ArrayList<>();
public Crawler(String seed, String domain, int limit)
currentID = 0;
totalURLs = 0;
Crawler.seedURL = seed;
Crawler.domain = domain;
Crawler.limit = limit;
parser = new Parser();
toParse = new MyQueue();
toParse.add(seed);
public void crawl()
while (!toParse.isEmpty() && currentID < limit)
if (toParse.peek().getData() != null) //if next node to be parsed is not null
String link = (String) toParse.remove().getData(); //remove node from queue and parse
if (isValidURL(link))
if (!visited.contains(link)) //if link has not been previously visited
Page p = new Page(link, currentID);
if (!pageExists(p)) //custom method to test whether page has already been created
try
Document d = parser.getDocument(link);
if (parse(d, currentID))
currentID++;
addPageToList(p); //Add page to list of parsed pages
catch (ParseException e)
e.printStackTrace();
visited.add(link); //Add links to visited regardless of whether they have been parsed
else
break;
public boolean parse(Document doc, int id)
boolean returner = false;
try
parseLinks(doc);
parseText(doc, id);
returner = true;
catch (ParseException e)
e.printStackTrace();
return returner;
public boolean pageExists (Page p)
for (Page page : parsed)
if (page.equals(p))
return true;
return false;
public static void main(String[] args) throws ParseException
Crawler cl = new Crawler("https://www.cs.purdue.edu/homes/bxd/", "www.cs.purdue.edu", 50);
Parser p = new Parser();
Document d = p.getDocument("http://www.cs.purdue.edu/homes/cs177");
cl.parse(d, currentID);
public void parseLinks(Document doc) throws ParseException
Elements links = parser.getLinks(doc);
boolean exists = false;
for (Element element : links)
for (String string : visited)
if (element.hasText())
if (element.equals(string))
exists = true;
if (!exists)
toParse.add(element.attr("abs:href"));
public void parseText(Document doc, int id)
try
String text = parser.getBody(doc); //Gather text
if (!text.equals("")) //see getBody() in Parser class
String[] splitUp = text.split(" "); //Split up into array
for (String string : splitUp) //parse through words
boolean isInArray = false; // boolean to check if current word is in array
for (Word word : words) // parse through Words array
if (word.getWord().equals(string)) // if the word exists
isInArray = true;
word.addURLID(id);
if (!isInArray)
addWordToList(string.toLowerCase(), id);
catch (ParseException e)
e.printStackTrace();
public void addWordToList(String word, int id)
Word wordle = new Word(word, id);
words.add(wordle);
public void addToQueue(String url)
boolean duplicateURL = false;
for (Page page : parsed)
if (page.getURL().equals(url))
duplicateURL = true;
if (!duplicateURL)
toParse.add(url);
totalURLs++;
public void addPageToList(Page p)
boolean exists = false;
for (Page page : parsed)
if (page.equals(p))
exists = true;
if (!exists)
parsed.add(p);
public boolean isInDomain(String url)
if (url.contains(domain))
return true;
return false;
public boolean isValidURL(String url)
boolean valid = false;
if ((url.startsWith("https://") || url.startsWith("http://")))
valid = true;
else
valid = false;
return valid;
java之webmagic网络爬虫(代码片段)
webmagic简介: WebMagic是一个简单灵活的Java爬虫框架。你可以快速开发出一个高效、易维护的爬虫。 http://webmagic.io/ 准备工作: Maven依赖(我这里用的Maven创建的web项目做测试): <dependencies&... 查看详情
java的简单网络爬虫(爬取花瓣网的图片)(代码片段)
...简单的python爬虫,所以在学完java基础后写了一个简单的网络图片爬虫。废话不多说直接上过程代码。(爬取的图源来自花瓣网:https://huaban.com/boards/favorite/beauty/)源url页面分析拿到爬取的源url,首先是分析页面哪些东西是要爬... 查看详情
python项目实战之网络爬虫详解(代码片段)
...述二、原理三、爬虫分类1、传统爬虫2、聚焦爬虫3、通用网络爬虫(全网爬虫)四、网页抓取策略1、宽度优先搜索:2、深度优先搜索:3、最佳优先搜索:4、反向链接数策略:5、PartialPageRank策略:五... 查看详情
java爬虫下载ftp网站目录文件(代码片段)
...写在前面爬虫的本质就是自动化的去模拟正常人类发起的网络请求,然后获取网络请求所返回的数据。跟我们人手动去点击一个连接,访问一个网页获取数据,并没有什么本质的区别。下面用java的方式来爬虫ftp网站... 查看详情
java爬虫入门(代码片段)
通用网络爬虫又称全网爬虫(ScalableWebCrawler),爬行对象从一些种子URL扩充到整个Web,主要为门户站点搜索引擎和大型Web服务提供商采集数据。今天我写的主要是一些皮毛入门现在来看下我们的pom依赖 <projectxmlns="http://maven... 查看详情
爬虫入门(代码片段)
爬虫简单的说网络爬虫(Webcrawler)也叫做网络铲(Webscraper)、网络蜘蛛(Webspider),其行为一般是先“爬”到对应的网页上,再把需要的信息“铲”下来。分类网络爬虫按照系统结构和实现技术,大致可以分为以下几种类型:... 查看详情
text网络爬虫(代码片段)
python网络爬虫(代码片段)
java网络爬虫,就是这么的简单(代码片段)
这是Java网络爬虫系列文章的第一篇,如果你还不知道Java网络爬虫系列文章,请参看学Java网络爬虫,需要哪些基础知识。第一篇是关于Java网络爬虫入门内容,在该篇中我们以采集虎扑列表新闻的新闻标题和详情页... 查看详情
java学习网络编程全总结——tcpudp多线程io流socket简易在线咨询聊天室java爬虫(代码片段)
目录网络编程1.1、概述1.2、网络通信的要素1.3、IP1.4、端口(port)1.5、通信协议1.6、TCP文件上传Tomcat1.7、UDP发送消息构造方法摘要构造方法摘要咨询DatagramPacket.getData()与DatagramPacket.getLength()的误区trimstartsWith在线咨询:两个人都... 查看详情
xpath语法-爬虫(代码片段)
...过Xpath获取网页中我们想要的内容;为我们的后面学习Java网络爬虫基础准备工作。备注:此章节为基础核心章节,未来会在网络爬虫的数据解析环节经常使用,学会Xpath解析语法,可为未来爬虫解析省去很多麻烦。Xpath简介 ... 查看详情
网络爬虫(代码片段)
1.爬虫流程图2.简单爬虫整个网页的内容--python2importurllib2response=urllib2.urlopen("http://www.baidu.com")html=response.read()print(html) 3.中文乱码处理 #coding:utf-8importre#importrequestsimportsysimportcodecs#p 查看详情
javascript一个简单的网络爬虫(代码片段)
java爬爬学习之webmagic(代码片段)
...爬虫的配置、启动和终止Spider爬虫配置Site爬虫分类通用网络爬虫聚焦网络爬虫增量式网络爬虫DeepWeb爬虫案例开发分析数据库表实现流程Scheduler组件三种去重方式使用和定制PipelinePipeline输出已有的Pipeline案例实现引入依赖加入配... 查看详情
网络爬虫是什么?怎么学python爬虫(代码片段)
网络爬虫又称网络蜘蛛、网络机器人,它是一种按照一定的规则自动浏览、检索网页信息的程序或者脚本。网络爬虫能够自动请求网页,并将所需要的数据抓取下来。通过对抓取的数据进行处理,从而提取出有价值的... 查看详情
网络爬虫(代码片段)
1.利用requests.get(url)获取网页页面的html文件importrequestsnewsurl=‘http://news.gzcc.cn/html/xiaoyuanxinwen/‘res=requests.get(newsurl)#返回response对象res.encoding=‘utf-8‘ 2.利用BeautifulSoup的HTML解析器,生成结构树frombs4 查看详情
ruby针对复杂网络的twitter爬虫(代码片段)
python网络爬虫课程设计(代码片段)
...什么?(10分)为了通过爬取网站获取的信息来分析现在网络上社会、经济、技术等各种信息网站的影响力排行,以此了解人们对哪种信息网站更青睐,访问的更加频繁。二、主题式网络爬虫设计方案(10分)1.主题式网络爬虫名... 查看详情