contentextractor.getnewsbyurl的簡單介紹

本文目錄一覽：

在做網頁，在my eclipse 中出現java.sql.SQLException: No value specified for parameter 1要怎麼處理呢

報錯為：SQL第一個條件參數沒有值傳入。

說明參數傳遞的有問題或者沒傳遞，就會報這個錯誤。

舉例：

String sql = "select * from users where id=? and passwd=?";

rs = dealDateBase.getRS(sql, user.getUsername(), user.getUserPas());

備註：傳遞的參數個數與類型，必須與SQL語句中佔位符（?）的個數與類型一致，否則就會報錯。

拓展資料

第一步：新建web工程後，把mysql的連接驅動程序放在lib目錄里。

第二步：編寫資料庫連接程序：

import java.sql.Connection;

import java.sql.DriverManager;

/**
 * Opens JDBC connections to the local {@code newssystem} MySQL database.
 */
public class DbConnection {

    /**
     * Loads the MySQL JDBC driver and opens a connection with the hard-coded
     * URL, user and password.
     *
     * @return an open connection, or {@code null} when the driver class cannot be
     *         found or the connection attempt fails (the cause is printed to
     *         {@code System.err})
     */
    public Connection getConnection() {
        String driver = "com.mysql.jdbc.Driver";
        String url = "jdbc:mysql://localhost:3306/newssystem";
        String user = "root";
        String password = "0211";
        Connection conn = null;
        // Load the driver class so that DriverManager can connect to the database.
        try {
            Class.forName(driver);
            conn = DriverManager.getConnection(url, user, password);
        } catch (ClassNotFoundException cnfex) {
            System.err.println("資料庫連接異常！！" + cnfex.getMessage());
        } catch (java.sql.SQLException sqlex) {
            // Previously a `return` inside `finally` discarded this failure silently;
            // report it so that a null result can be diagnosed.
            System.err.println("資料庫連接異常！！" + sqlex.getMessage());
        }
        return conn;
    }
}

第三步：編寫資料庫操作dao類，就是對數據的增刪查改。再給你舉一個例子吧，這是之前做的一個小項目里，直接複製過來，你參考參考。

public class NewsDao {

Connection con;

// 添加數據

public void insertNews(News news) {

String sql = “insert into news values(0，?，?，?，now()，?，?)”;

try {

DbConnection db = new DbConnection();

con = db，getConnection();

PreparedStatement ps = con，prepareStatement(sql);

ps，setString(1， news，getTitle());

ps，setString(2， news，getContent());

ps，setString(3， news，getAuthor());

//ps，setString(4， news，getDate());

ps，setString(4， news，getSort());

ps，setString(5， news，getImage());

ps，executeUpdate();

} catch (Exception e) {

System，err，println(“資料庫有誤：” + e，getMessage());

} finally {

try {

if (con != null)

con，close();

} catch (Exception e2) {

System，err，println(“資料庫關閉有誤：” + e2，getMessage());

}

第四步：在相應的操作類中調用dao類；

這是按照平時的項目實施流程來的，如果你只是想測試連接資料庫，可以把這幾步綜合在一起。不過最後還是按照嚴格的分層來吧，養成一個好習慣！

如何使用webcollector爬取搜索引擎

使用webcollector爬取搜索引擎，按照關鍵字搜索的結果頁面，解析規則可能會隨百度搜索的改版而失效。

代碼如下：

[java] view plain copy

package com.wjd.baidukey.crawler;

import java.io.ByteArrayInputStream;

import java.io.FileOutputStream;

import java.io.IOException;

import java.net.URLEncoder;

import java.sql.Connection;

import java.sql.DriverManager;

import java.sql.PreparedStatement;

import java.sql.ResultSet;

import java.sql.SQLException;

import java.sql.Statement;

import java.text.SimpleDateFormat;

import java.util.Date;

import java.util.HashMap;

import java.util.TimeZone;

import org.apache.poi.poifs.filesystem.DirectoryEntry;

import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

import cn.edu.hfut.dmic.contentextractor.ContentExtractor;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;

import cn.edu.hfut.dmic.webcollector.model.Page;

import cn.edu.hfut.dmic.webcollector.plugin.ram.RamCrawler;

public class BdiduKeywordCrawler extends RamCrawler{

private Connection connection;

private PreparedStatement pstatement;

// 連接MySql資料庫，用戶名root，密碼mahao

String url = “jdbc:mysql://localhost:3306/wjd”;

String username = “root”;

String password = “mahao”;

//保存抽取到的數據

StringBuilder result = new StringBuilder();

public BdiduKeywordCrawler(String keyword, int maxPageNum) throws Exception {

for (int pageNum = 1; pageNum = maxPageNum; pageNum++) {

String url = createUrl(keyword, pageNum);

CrawlDatum datum = new CrawlDatum(url)

.putMetaData(“keyword”, keyword)

.putMetaData(“pageNum”, pageNum + “”)

.putMetaData(“pageType”, “searchEngine”)

.putMetaData(“depth”, “1”);

addSeed(datum);

}

@Override

public void visit(Page page, CrawlDatums next) {

String keyword = page.getMetaData(“keyword”);

String pageType = page.getMetaData(“pageType”);

int depth = Integer.valueOf(page.getMetaData(“depth”));

if (pageType.equals(“searchEngine”)) {

int pageNum = Integer.valueOf(page.getMetaData(“pageNum”));

System.out.println(“成功抓取關鍵詞” + keyword + “的第” + pageNum + “頁搜索結果”);

// || div[class=result-op c-container xpath-log ]h3a

Elements results = page.select(“div[class=result c-container ]h3a”);

// Elements results1 = page.select(“div[class=result-op c-container xpath-log]h3a”);//,div[id=result-op c-container xpath-log]h3a

//System.out.println(results1.get(0));

//results.add(results1.get(0));

for (int rank = 0; rank results.size(); rank++) {

Element result = results.get(rank);

* 我們希望繼續爬取每條搜索結果指向的網頁，這裡統稱為外鏈。

* 我們希望在訪問外鏈時仍然能夠知道外鏈處於搜索引擎的第幾頁、第幾條，

* 所以將頁號和排序信息放入後續的CrawlDatum中，為了能夠區分外鏈和

* 搜索引擎結果頁面，我們將其pageType設置為outlink，這裡的值完全由用戶定義，可以設置一個任意的值

* 在經典爬蟲中，每個網頁都有一個refer信息，表示當前網頁的鏈接來源。

* 例如我們首先訪問新浪首頁，然後從新浪首頁中解析出了新的新聞鏈接，

* 則這些網頁的refer值都是新浪首頁。WebCollector不直接保存refer值，

* 但我們可以通過下面的方式，將refer信息保存在metaData中，達到同樣的效果。

* 經典爬蟲中錨文本的存儲也可以通過下面方式實現。

* 在一些需求中，希望得到當前頁面在遍歷樹中的深度，利用metaData很容易實現

* 這個功能，在將CrawlDatum添加到next中時，將其depth設置為當前訪問頁面的depth+1即可。

CrawlDatum datum = new CrawlDatum(result.attr(“abs:href”))

.putMetaData(“keyword”, keyword)

.putMetaData(“pageNum”, pageNum + “”)

.putMetaData(“rank”, rank + “”)

.putMetaData(“pageType”, “outlink”)

.putMetaData(“depth”, (depth + 1) + “”)

.putMetaData(“refer”, page.getUrl());

next.add(datum);

}

} else if (pageType.equals(“outlink”)) {

/*int pageNum = Integer.valueOf(page.getMetaData(“pageNum”));

int rank = Integer.valueOf(page.getMetaData(“rank”));

String refer = page.getMetaData(“refer”);*/

try {

String content = ContentExtractor.getContentByUrl(page.getUrl());

/*String line = String.format(

“第%s頁第%s個結果:標題:%s(%s位元組)\tdepth=%s\trefer=%s”, pageNum,

rank + 1, page.getDoc().title(), content,

depth, refer);*/

String line = String.format(“標題：%s\n來源：%s\n正文：%s”, page.getDoc().title(),page.getUrl(),content);

HashMapString, String data = new HashMapString,String();

Date currentDate = new java.util.Date();

SimpleDateFormat myFmt = new SimpleDateFormat(“yyyy年MM月dd日 HH:mm:ss”);

TimeZone timeZoneChina = TimeZone.getTimeZone(“Asia/Shanghai”);// 獲取中國的時區

myFmt.setTimeZone(timeZoneChina);// 設置系統時區

String grabTime = myFmt.format(currentDate);// new Date()為獲取當前系統時間

data.put(“title”, page.getDoc().title());

data.put(“from”, page.getUrl());

data.put(“content”, content);

data.put(“grabTime”, grabTime);

//String line = String.format(“標題：%s\n”, page.getDoc().title());

//持久化到word文檔中

//是否為線程安全？？？

//synchronized(this) {

String destFile = “D:\\”+”Result”+keyword+”.doc”;

result.append(line);

//將result寫到doc文件中

write2File(destFile,result.toString());

//添加到資料庫中

addResultData(data);

//}

System.out.println(line);

} catch (Exception e) {

//e.printStackTrace();

System.out.println(“鏈接”+page.getUrl()+”失效”);

}

//將數據保存到mysql資料庫中

private void addResultData(HashMapString, String data) {

String title = data.get(“title”);

String source_url = data.get(“from”);

String content = data.get(“content”).replaceAll(“\\?{2,}”, “”);//去掉字元串中出現的多個連續問號。

//抓取時間

String grabTime = data.get(“grabTime”);

/*SimpleDateFormat format = new SimpleDateFormat(“yyyy年MM月dd日 HH:mm:ss”);

Date date = null;

try {

date = format.parse(grabTime);

} catch (Exception e) {

e.printStackTrace();

}*/

//System.out.println(“抓取時間”+grabTime);

try {

connection = DriverManager.getConnection(url, username, password);

String sql = “INSERT INTO wjd_keyword_search_table(TITLE,GRAP_TIME,CONTENT,SOURCE_URL) VALUES(?,?,?,?)”;

String checkSql = “select 1 from wjd_keyword_search_table where TITLE='” + title + “‘”;

Statement statement = connection.prepareStatement(checkSql);

ResultSet result = statement.executeQuery(checkSql);

if (!result.next()) {

// 如果資料庫中不存在該記錄，則添加到資料庫中

pstatement = connection.prepareStatement(sql);

pstatement.setString(1, title);

//pstatement.setString(2, date);

pstatement.setString(2,grabTime);

pstatement.setString(3, content);

pstatement.setString(4, source_url);

pstatement.executeUpdate();

}

} catch (SQLException e) {

e.printStackTrace();

}

/**

* 將數據持久化到本地doc文件中

* @param destFile

* @param line

private void write2File(String destFile, String line) {

try {

//doc content

ByteArrayInputStream bais = new ByteArrayInputStream(line.getBytes());

POIFSFileSystem fs = new POIFSFileSystem();

DirectoryEntry directory = fs.getRoot();

directory.createDocument(“WordDocument”, bais);

FileOutputStream ostream = new FileOutputStream(destFile);

fs.writeFilesystem(ostream);

bais.close();

ostream.close();

} catch (IOException e) {

e.printStackTrace();

}

public static void main(String[] args) throws Exception {

String[] keywordsList = {“網路爬蟲”,”搜索引擎”};

int pageToal =5;

for (String keyword : keywordsList) {

BdiduKeywordCrawler crawler = new BdiduKeywordCrawler(keyword, pageToal);

crawler.start();

}

/**

* 根據關鍵詞和頁號拼接百度搜索對應的URL

public static String createUrl(String keyword, int pageNum)

throws Exception {

int first = (pageNum-1) * 10;

keyword = URLEncoder.encode(keyword, “utf-8”);

return String.format(“;pn=%s”,

keyword, first);

}

python3 怎麼爬取新聞網站

需求：

從門戶網站爬取新聞，將新聞標題，作者，時間，內容保存到本地txt中。

用到的python模塊：

import re # 正則表達式

import bs4 # Beautiful Soup 4 解析模塊

import urllib2 # 網路訪問模塊

import News #自己定義的新聞結構

import codecs #解決編碼問題的關鍵，使用codecs.open打開文件

import sys #1解決不同頁面編碼問題

其中bs4需要自己裝一下，安裝方法可以參考：Windows命令行下pip安裝python whl包

程序：

#coding=utf-8

import re # 正則表達式

import bs4 # Beautiful Soup 4 解析模塊

import urllib2 # 網路訪問模塊

import News #自己定義的新聞結構

import codecs #解決編碼問題的關鍵，使用codecs.open打開文件

import sys #1解決不同頁面編碼問題

# Python 2 only: reload sys (step 2) and then set the process-wide default
# string encoding to UTF-8 (step 3), to cope with pages in different encodings.
reload(sys)  # 2
sys.setdefaultencoding('utf-8')  # 3

# 從首頁獲取所有鏈接

def GetAllUrl(home):
    """Collect every Baijia article link on the page at ``home`` into ``url_set``.

    The page is downloaded, decoded as UTF-8 and parsed with Beautiful Soup;
    each ``<a>`` whose ``href`` matches the article URL pattern is added to the
    module-level ``url_set``.
    """
    html = urllib2.urlopen(home).read().decode('utf8')
    soup = bs4.BeautifulSoup(html, 'html.parser')
    pattern = r'http://\w+\.baijia\.baidu\.com/article/\w+'
    links = soup.find_all('a', href=re.compile(pattern))
    for link in links:
        url_set.add(link['href'])

def GetNews(url):
    """Crawl pending URLs until one article has been saved.

    URLs are taken from the module-level ``url_set`` (the ``url`` argument is
    overwritten and therefore ignored, as in the original). Each popped URL is
    recorded in ``url_old``, new article links found on the page are queued,
    and the article is parsed and passed to ``SaveNews``. A page that fails to
    download or parse is reported and skipped. After the first successful save
    ``NewsCount`` is incremented and the function returns; it also stops once
    ``NewsCount`` has reached ``MaxNewsCount`` or ``url_set`` is empty.
    """
    global NewsCount, MaxNewsCount  # module-wide article counters
    while len(url_set) != 0 and NewsCount != MaxNewsCount:
        try:
            # Take the next URL and mark it as visited.
            url = url_set.pop()
            url_old.add(url)

            # Download and parse the page.
            html = urllib2.urlopen(url).read().decode('utf8')
            soup = bs4.BeautifulSoup(html, 'html.parser')
            pattern = r'http://\w+\.baijia\.baidu\.com/article/\w+'  # article link rule
            links = soup.find_all('a', href=re.compile(pattern))

            # Queue article links that have not been visited yet.
            for link in links:
                if link['href'] not in url_old:
                    url_set.add(link['href'])

            # Extract the article fields.
            article = News.News()
            article.url = url
            page = soup.find('div', {'id': 'page'})
            article.title = page.find('h1').get_text()
            info = page.find('div', {'class': 'article-info'})
            article.author = info.find('a', {'class': 'name'}).get_text()
            article.date = info.find('span', {'class': 'time'}).get_text()
            article.about = page.find('blockquote').get_text()
            pnode = page.find('div', {'class': 'article-detail'}).find_all('p')
            article.content = ''
            for node in pnode:  # append each paragraph
                article.content += node.get_text() + '\n'
            SaveNews(article)
        except Exception as e:
            print(e)
            continue
        # Count the saved article here: the original `break` inside `try` skipped
        # the `else` clause, so NewsCount was never incremented.
        print(article.title)
        NewsCount += 1
        print(NewsCount)
        break

def SaveNews(Object):
    """Append one article to the module-level output ``file``.

    Writes the bracketed title, then author and date separated by tabs, then
    the content followed by a blank line.
    """
    file.write("【" + Object.title + "】" + "\t")
    file.write(Object.author + "\t" + Object.date + "\n")
    file.write(Object.content + "\n" + "\n")

url_set = set()  # URLs waiting to be crawled
url_old = set()  # URLs already crawled

NewsCount = 0
MaxNewsCount = 3

# NOTE(review): the start URL was stripped from the published listing (only a
# stray quote remained). Restored to the Baijia home page, the site the article
# link pattern targets -- confirm before running.
home = 'http://baijia.baidu.com/'  # start page

GetAllUrl(home)

file = codecs.open("D:\\test.txt", "a+")  # output file
try:
    # Iterate over a snapshot: GetNews pops from and adds to url_set, and
    # changing a set while iterating over it raises RuntimeError.
    for url in list(url_set):
        GetNews(url)
        # Stop once enough articles have been collected.
        if NewsCount == MaxNewsCount:
            break
finally:
    file.close()

新聞文章結構

#coding: utf-8

# 文章類定義

class News(object):
    """Plain container for one crawled article; every field starts as ``None``."""

    def __init__(self):
        self.url = None
        self.title = None
        self.author = None
        self.date = None
        self.about = None
        self.content = None

對爬取的文章數量進行統計。

現在我想這個鏈接跳轉到一個action方法，那麼AAA應該怎麼寫呢？

<td><a href="<%=request.getContextPath()%>/news.do?method=edit">編輯新聞</a></td>

樓主試試看，不行的話把struts-config貼出來我看看。

原創文章，作者：小藍，如若轉載，請註明出處：https://www.506064.com/zh-tw/n/290924.html

contentextractor.getnewsbyurl的簡單介紹

本文目錄一覽：

在做網頁，在my eclipse 中出現java.sql.SQLException: No value specified for parameter 1要怎麼處理呢

如何使用webcollector爬取搜索引擎

python3 怎麼爬取新聞網站

現在我想這個鏈接跳轉到一個action方法，那麼AAA應該怎麼寫呢？

相關推薦

發表回復