百度知道的php爬虫

网络整理 - 07-26

<?php
/*
百度知道爬虫！
经过分析，百度知道的问题页面是静态网页，网址格式主要为 https://zhidao.baidu.com/question/(编号).html，其中编号即该问题的编号(PID)，按提问时间依次递增。由于部分问题已被删除或其他原因，编号可能不连续；当发现某个问题已被删除时，直接跳过该编号继续爬取即可。
*/

/**
 * Minimal forward-only scraper for one question page.
 *
 * Usage: call GetContent() to load a page, then call the Get*() extractors.
 * Every extractor except GetTitle() searches from an internal cursor and, on
 * success, moves that cursor to the closing tag it matched. Extractors should
 * therefore be called in document order; calling GetAnswer() repeatedly walks
 * through successive answers until it returns null.
 */
class spider
{
    /** Raw text of the page most recently loaded by GetContent(). */
    private $content = '' ;
    /** Byte length of $content. */
    private $contentlen = 0 ;
    /** Not read or written by any method of this class. */
    private $BestAnswer ;
    /** Byte offset in $content at which the next cursor-based search starts. */
    private $CurPosition = 0 ;

    /**
     * Returns the offset just past the first '>' found at or after $iStart.
     *
     * Kept for callers that relied on it; when no '>' exists the result is 1
     * (strpos() yields false, which counts as 0 in the addition), exactly as
     * before. The extractors below do their own checked lookup instead.
     *
     * @param int $iStart Byte offset to start searching from.
     * @return int Offset of the character following the '>'.
     */
    public function GetStart( $iStart )
    {
        return strpos( $this->content , '>' , $iStart ) + 1 ;
    }

    /**
     * Loads $url and reports whether it looks like a question page.
     *
     * A page qualifies when its <title> text contains '_百度知道' preceded by at
     * least one byte. The cursor is reset so extraction starts from the top of
     * the new page.
     *
     * @param string $url Anything file_get_contents() accepts.
     * @return bool true for a recognised page; false when loading failed, the
     *              title is missing, or the title lacks the marker.
     */
    public function GetContent ( $url )
    {
        $page = file_get_contents( $url ) ;
        $this->CurPosition = 0 ;
        if ( $page === false )
        {
            // Load failed: leave the object empty instead of holding `false`.
            $this->content = '' ;
            $this->contentlen = 0 ;
            return false ;
        }
        $this->content = $page ;
        $this->contentlen = strlen( $page ) ;

        $span = $this->LocateText( '<title>' , '</title>' , 0 ) ;
        if ( $span === null )
        {
            return false ;
        }
        $title = substr( $this->content , $span[0] , $span[1] - $span[0] ) ;
        // Offset 1 is only valid on a non-empty string, hence the first check.
        return $title !== '' && strpos( $title , '_百度知道' , 1 ) !== false ;
    }

    /**
     * Returns the text of the page's <title> element and moves the cursor to
     * its closing tag. Always searches from the start of the page.
     *
     * @return string|null Title text, or null when no complete <title> exists.
     */
    public function GetTitle()
    {
        return $this->ReadNext( '<title>' , '</title>' , 0 ) ;
    }

    /**
     * Returns the text of the next span carrying class "question-title".
     *
     * @return string|null Question title, or null when not found.
     */
    public function GetQTitle()
    {
        // The closing tag must match the opening <span ...> marker.
        return $this->ReadNext( 'span class="question-title"' , '</span>' , $this->CurPosition ) ;
    }

    /**
     * Placeholder with no implementation; always returns null.
     */
    public function GetClassFly()
    {
    }

    /**
     * Returns the text of the next pre element with id "question-content".
     *
     * @return string|null Question body, or null when not found.
     */
    public function GetQContent()
    {
        return $this->ReadNext( 'pre id="question-content"' , '</pre>' , $this->CurPosition ) ;
    }

    /**
     * Returns the text of the next element with id "question-suply".
     *
     * @return string|null Supplementary question text, or null when not found.
     */
    public function GetQsuply()
    {
        return $this->ReadNext( 'id="question-suply"' , '</pre>' , $this->CurPosition ) ;
    }

    /**
     * Returns the text of the next element with class "reply-text mb10".
     *
     * @return string|null Next answer, or null when no further answer exists.
     */
    public function GetAnswer()
    {
        return $this->ReadNext( 'class="reply-text mb10"' , '</pre>' , $this->CurPosition ) ;
    }

    /**
     * Finds the text enclosed by the tag containing $marker and the following
     * $closing string.
     *
     * @param string $marker  Text inside (or equal to) the opening tag.
     * @param string $closing Closing tag that terminates the text.
     * @param int    $offset  Byte offset to start searching from.
     * @return array|null array( start , end ) byte offsets of the enclosed
     *                    text, or null when the marker, the '>' ending the
     *                    opening tag, or the closing tag cannot be found.
     */
    private function LocateText( $marker , $closing , $offset )
    {
        $markerPos = strpos( $this->content , $marker , $offset ) ;
        if ( $markerPos === false )
        {
            return null ;
        }
        $tagEnd = strpos( $this->content , '>' , $markerPos ) ;
        if ( $tagEnd === false )
        {
            return null ;
        }
        $start = $tagEnd + 1 ;
        $end = strpos( $this->content , $closing , $start ) ;
        if ( $end === false )
        {
            return null ;
        }
        return array( $start , $end ) ;
    }

    /**
     * Extracts the text located by LocateText() and, on success, advances the
     * cursor to the matched closing tag. The cursor is left untouched on a miss.
     *
     * @param string $marker  Text inside (or equal to) the opening tag.
     * @param string $closing Closing tag that terminates the text.
     * @param int    $offset  Byte offset to start searching from.
     * @return string|null Extracted text, or null when nothing matched.
     */
    private function ReadNext( $marker , $closing , $offset )
    {
        $span = $this->LocateText( $marker , $closing , $offset ) ;
        if ( $span === null )
        {
            return null ;
        }
        $this->CurPosition = $span[1] ;
        return substr( $this->content , $span[0] , $span[1] - $span[0] ) ;
    }
}
// Crawl configuration. The run fetches one page per id, so the default
// execution time limit is lifted.
ini_set('max_execution_time', '0');
$TestSpider = new spider() ;
// Question ids to crawl: $Startqid is inclusive, $sndqid is exclusive.
$Startqid = 1000001 ;
$sndqid = 1000051 ;
// Page address is $standurl . <id> . $html.
// NOTE(review): the prefix was empty in the original, which makes the crawler
// read local files named "<id>.html". It is restored here to the Baidu Zhidao
// question URL prefix; confirm this is the intended target (fetching https
// addresses requires the openssl stream wrapper).
$standurl = 'https://zhidao.baidu.com/question/' ;
$html = '.html' ;
// Number of ids skipped because the page was not a valid question page.
$NoUse = 0 ;
/**
 * Returns the current Unix timestamp, in seconds, with a fractional part.
 *
 * @return float Seconds since the Unix epoch.
 */
function microtime_float()
{
    // microtime(true) returns the float directly, so the "usec sec" string
    // form no longer has to be split and summed by hand.
    return microtime( true ) ;
}
// Crawl every id in [$Startqid, $sndqid), print what was extracted from each
// page, and finish with a timing summary.
// NOTE(review): the extracted text comes from a remote page and is echoed
// without HTML escaping; escape it (with the page's actual character set)
// before exposing this output to a browser you care about.
$time_start = microtime_float();
for ($i = $Startqid ; $i < $sndqid ; $i++ )
{
    $url = $standurl.$i.$html ;
    if ( ! $TestSpider->GetContent ( $url ) )
    {
        // Not a question page (deleted id, load failure, ...): count and skip.
        echo '错误了<a target="_blank" href="'.$url.'" style= "color:#ff0000">'.$url.'</a>' ;
        $NoUse++ ;
        continue ;
    }

    echo ' 正在爬取编号为'.$i.'的网页 ' ;
    // Called only to move the scanner past the page title; the text itself
    // is not displayed.
    $TestSpider->GetTitle() ;
    // Question title.
    echo '问题：<a target="_blank" href="'.$url.'"> '.$TestSpider->GetQTitle().'</a> ' ;
    // Question body; may be absent.
    echo '问题具体内容：'.$TestSpider->GetQContent().' ' ;
    // Supplementary description; may be absent.
    echo '问题补充说明：'.$TestSpider->GetQsuply().' ' ;
    // Print every answer; a question may have none. The loose comparison is
    // deliberate: an empty result ends the loop as well as null, which keeps
    // the loop bounded if the scanner ever fails to advance.
    while ( ($answer = $TestSpider->GetAnswer()) != NULL )
    {
        echo '问题答案：'.$answer.' ' ;
    }
    // ob_flush() raises a notice when no output buffer is active.
    if ( ob_get_level() > 0 )
    {
        ob_flush() ;
    }
    flush() ;
}
$time_end = microtime_float();
$time = $time_end - $time_start;
// After the loop $i equals $sndqid, so this is the number of ids visited.
$i = $i-$Startqid ;
echo '爬取'.$i.'个网页用时'.$time.'秒其中跳过'.$NoUse.'个无效网页！' ;