抓取百度知道的内容到discuz论坛的类
[php]<?php
// Driver script: for every forum row selected below, crawl the first $page
// listing pages and import what is found through the discuz class.

// Remove the execution time limit; the crawl issues many HTTP requests.
set_time_limit(0);

mysql_connect("localhost", "root", "123456") or die("Could not connect: " . mysql_error());
// Stop early when the database cannot be selected instead of failing on every later query.
mysql_select_db("discuz") or die("Could not select database: " . mysql_error());
// The connection and results use GBK; client data is sent as raw bytes.
mysql_query("SET character_set_connection=gbk, character_set_results=gbk, character_set_client=binary");

$discuz = new discuz();

// Number of listing pages to crawl per forum.
$page = 5;

// Forums whose fup column is positive (presumably sub-forums with a parent - confirm against the schema).
$sql = "select fid from cdb_forums where fup >0";
// mysql_query() returns false on error; without this check the fetch loop would only emit warnings.
$result = mysql_query($sql) or die("Could not list forums: " . mysql_error());

while ($row = mysql_fetch_assoc($result)) {
    for ($i = 1; $i <= $page; $i++) {
        $discuz->getList($row['fid'], $i);
    }
}
?>
<?php
/**
 * Importer that scrapes questions and answers from zhidao.baidu.com and
 * writes them into Discuz! tables (members, threads, posts, forums) through
 * the already-open legacy mysql_* connection.
 *
 * All string arguments that end up in SQL must be escaped by the caller
 * (getOne() does this with addslashes()).
 */
class discuz{
    /** Table-name prefix of the target Discuz! installation. */
    var $tblPre='cdb_';

    /**
     * Return the ID of the member with the given name, creating the member
     * first when it does not exist yet.
     *
     * A newly created member gets randomised registration/visit/post times,
     * post count, online time and credits, plus matching rows in the
     * memberfields and onlinetime tables.
     *
     * @access public
     * @param string $username member name, already escaped for SQL
     * @return int user ID
     */
    function insertMember($username){
        $sql="select uid from ".$this->tblPre."members where username ='$username'";
        $result = mysql_query($sql);
        // mysql_query() yields false on error; do not hand that to mysql_fetch_assoc().
        $row = $result ? mysql_fetch_assoc($result) : false;
        if($row){
            return $row['uid'];
        }

        // Fabricate a plausible history for the new member.
        $regdate =intval(time()-rand(8640000,28640000));
        $lastvisit =intval(time()-rand(86400,864000));
        $lastpost =$lastvisit+rand(120,1200);
        $posts=rand(2,20);
        $oltime=rand(5,245);
        $extcredits1=rand(50,845);
        $sql="insert into ".$this->tblPre."members set username ='$username',regdate=$regdate,lastvisit=$lastvisit,lastpost=$lastpost,lastactivity =$lastvisit,posts=$posts,oltime=$oltime,extcredits1=$extcredits1,groupid =12";
        mysql_query($sql);
        $uid=mysql_insert_id();

        // Pick one of the avatar images 01.gif .. 09.gif.
        $avatar=rand(1,9);
        $sql="replace into ".$this->tblPre."memberfields set nickname ='$username',uid=$uid,avatar='images/avatars/0$avatar.gif',avatarwidth=83,avatarheight=94";
        mysql_query($sql);

        $thismonth = rand(200,3000);
        $total=$thismonth+rand(200,3000);
        $sql="insert into ".$this->tblPre."onlinetime set thismonth ='$thismonth',total=$total, uid=$uid";
        mysql_query($sql);

        return $uid;
    }

    /**
     * Insert (or replace) a thread together with its first post and update
     * the owning forum's counters. Existing posts of the thread are deleted
     * first so that a re-import does not duplicate replies.
     *
     * @access public
     * @param int    $fid      forum ID
     * @param int    $tid      thread ID
     * @param string $subject  thread title, already escaped for SQL
     * @param string $message  body of the first post, already escaped for SQL
     * @param string $author   author name, already escaped for SQL
     * @param int    $dateline Unix timestamp of the thread
     * @return int thread ID
     */
    function insertThread($fid,$tid,$subject,$message,$author,$dateline){
        // These values are spliced into SQL unquoted, so force them to integers.
        $fid=intval($fid);
        $tid=intval($tid);
        $dateline=intval($dateline);

        // Make sure the author exists as a member.
        $authorid=$this->insertMember($author);

        // Insert the thread row.
        $sql="replace into ".$this->tblPre."threads set tid=$tid,fid=$fid,subject='$subject',authorid=$authorid,author='$author',dateline=$dateline";
        mysql_query($sql);

        // forums.lastpost is written as a tab-separated record, so the subject must not contain tabs.
        $subject = str_replace("\t", ' ', $subject);
        $lastpost = "$tid\t$subject\t$dateline\t$author";
        $todayposts=rand(0,1);
        $sql="UPDATE ".$this->tblPre."forums SET lastpost='$lastpost', threads=threads+1, posts=posts+1, todayposts=todayposts+$todayposts WHERE fid='$fid'";
        mysql_query($sql);

        // Remove posts left over from a previous import of this thread.
        $sql="delete from ".$this->tblPre."posts where tid=$tid";
        mysql_query($sql);

        // Insert the opening post (first = 1).
        $sql="insert into ".$this->tblPre."posts set fid=$fid,tid=$tid,subject ='$subject',message='$message',authorid=$authorid,author='$author',dateline=$dateline,first =1";
        mysql_query($sql);

        return $tid;
    }

    /**
     * Insert a reply (first = 0) into an existing thread.
     *
     * @access public
     * @param int    $fid      forum ID
     * @param int    $tid      thread ID
     * @param string $message  reply body, already escaped for SQL
     * @param string $author   author name, already escaped for SQL
     * @param int    $dateline Unix timestamp of the reply
     * @return int value of mysql_insert_id() after the insert (0 when no ID was generated)
     */
    function insertPost($fid,$tid,$message,$author,$dateline){
        // These values are spliced into SQL unquoted, so force them to integers.
        $fid=intval($fid);
        $tid=intval($tid);
        $dateline=intval($dateline);

        // Make sure the author exists as a member.
        $authorid=$this->insertMember($author);

        // Insert the reply.
        $sql="insert into ".$this->tblPre."posts set fid=$fid,tid=$tid,message='$message',authorid=$authorid,author='$author',dateline=$dateline,first =0";
        mysql_query($sql);

        return mysql_insert_id();
    }

    /**
     * Turn the date fragment scraped next to a reply into a Unix timestamp.
     *
     * The fragment is prefixed with the current year before parsing, i.e. it
     * is expected to carry no year of its own. When the resulting time lies
     * in the future, the previous year is used instead. An unparsable
     * fragment yields the current time.
     *
     * @access private
     * @param string $text raw date fragment
     * @return int Unix timestamp, never later than "now" unless parsing of the previous-year form fails
     */
    function _parseReplyTime($text){
        $text=trim($text);
        $now=time();
        $year=intval(date('Y'));

        $stamp=strtotime($year.'-'.$text);
        if($stamp!==false && $stamp!==-1 && $stamp>$now){
            // A month/day later than today must belong to last year.
            $stamp=strtotime(($year-1).'-'.$text);
        }
        // strtotime() signals failure with false (or -1 on very old PHP versions).
        if($stamp===false || $stamp===-1){
            $stamp=$now;
        }
        return $stamp;
    }

    /**
     * Fetch one question page from zhidao.baidu.com, parse the question and
     * its answers, and store them as a thread with replies.
     *
     * @access public
     * @param int $fid forum ID the thread is stored under
     * @param int $tid question ID; also used as the thread ID
     * @return bool false when the ID is invalid, the page cannot be fetched
     *              or the question cannot be parsed; true otherwise
     */
    function getOne($fid,$tid){
        // The ID comes from a regex capture; normalise it before it reaches the URL and SQL.
        $tid=intval($tid);
        if($tid<=0)return false;

        mysql_query("SET character_set_connection=gbk, character_set_results=gbk, character_set_client=binary");
        $data=file_get_contents('http://zhidao.baidu.com/question/'.$tid.'.html');
        if(!$data)return false;

        // Extract title, body and asker of the question; skip the page when the markup does not match.
        if(!preg_match_all('|<cq>(.*)</cq>(.*)<cd>(.*)</cd>(.*)提问者:(.*)-|isU',$data,$topicarr))return false;

        $topic=array();
        $topic['title']=trim($topicarr[1][0]);
        $topic['content']=trim(strip_tags($topicarr[3][0],'<br>'));
        $topic['username']=trim(strip_tags($topicarr[5][0]));
        $topic['dateline']=time();
        // strpos() returns false when the needle is absent, so compare against false explicitly.
        if(strpos($topic['username'],'匿名')!==false)$topic['username']='匿名';

        $posts=array();

        // Replies by named users: group 1 = body, group 3 = user name, group 6 = date fragment.
        preg_match_all('/<div class="f14 p90 pl10">(.*)<\/div>(.*)回答者:(.*)-(.*)target=_blank>(.*)<\/a>(.*)<\/div>/isU',$data,$postarr);
        $bodies=isset($postarr[1]) ? $postarr[1] : array();
        foreach($bodies as $k=>$body){
            $reply=array();
            $reply['title'] = trim(strip_tags($body,'<br>'));
            $reply['username'] = trim(strip_tags($postarr[3][$k]));
            $reply['dateline'] = $this->_parseReplyTime($postarr[6][$k]);
            // The question cannot be newer than any of its answers.
            $topic['dateline']=min($reply['dateline'],$topic['dateline']);
            $posts[]=$reply;
        }

        // Replies by anonymous users: group 1 = body, group 3 = date fragment.
        preg_match_all('/<div class="f14 p90 pl10">(.*)<\/div>(.*)回答者:匿名(.*)<\/div>/isU',$data,$anonarr);
        $bodies=isset($anonarr[1]) ? $anonarr[1] : array();
        foreach($bodies as $k=>$body){
            $reply=array();
            $reply['title'] = trim(strip_tags($body,'<br>'));
            $reply['username'] = '无名';
            $reply['dateline'] = $this->_parseReplyTime($anonarr[3][$k]);
            $topic['dateline']=min($reply['dateline'],$topic['dateline']);
            $posts[]=$reply;
        }

        // Date the thread 1000 seconds before its earliest reply (or before now when there are none).
        $topic['dateline']-=1000;
        $this->insertThread($fid,$tid,addslashes($topic['title']),addslashes($topic['content']),addslashes($topic['username']),$topic['dateline']);

        if($posts){
            foreach($posts as $post){
                $this->insertPost($fid,$tid,addslashes($post['title']),addslashes($post['username']),$post['dateline']);
            }
            // The last inserted reply becomes the thread's "last post".
            $last=$posts[count($posts)-1];
            $replies = count($posts);
            $views = rand(80,1000);
            // lastposter is escaped the same way as every other string sent to SQL in this method.
            $sql="UPDATE ".$this->tblPre."threads SET lastpost=".intval($last['dateline']).",replies=$replies,views=$views,lastposter ='".addslashes($last['username'])."' WHERE tid='$tid'";
            mysql_query($sql);
        }
        return true;
    }

    /**
     * Fetch one listing page from zhidao.baidu.com and import every question
     * linked from it via getOne().
     *
     * @access public
     * @param int $fid  ID used both in the listing URL and as the target forum ID
     * @param int $page 1-based page number; each page covers 25 entries
     * @return bool|null false when the listing cannot be fetched, otherwise nothing
     */
    function getList($fid,$page){
        $cons=file_get_contents("http://zhidao.baidu.com/browse/$fid?lm=0&pn=".($page-1)*25);
        if(!$cons)return false;

        // Collect the question IDs linked from the listing.
        preg_match_all('|<span class="f14"><a href="/question/(.*).html" target="_blank">|isU',$cons,$urls);
        $ids=isset($urls[1]) ? $urls[1] : array();

        // Import each question, pausing one second between requests.
        foreach($ids as $id){
            $this->getOne($fid,$id);
            sleep(1);
        }
    }
}
?>
[/php]
[[i] 本帖最后由 songlv 于 2008-3-8 10:07 PM 编辑 [/i]] 牛!顶!日 字数不够! .... 不错呢。。。。 LZ 很不错, LZ很牛啊 LZ很牛啊 好帖 呀 .... 不错呢。。。。
页:
[1]