preg_split regex help
In the processPage function below, I'm grabbing the keywords from the keywords meta tag of each URL processed. I need to alter the preg_split call
so that it only pulls the first three words of any keyword cluster.
For example, given this keywords meta tag:
<meta name="keywords" content="this is too long, this is not, keyword three" />
I only want the "this is too" part of the first keyword cluster.
Also, if the total list of keyword phrases is longer than 10, I only want to pull the first 10 keyword phrases from the list.
i.e., (keyword phrase 1, kw 2, kw 3, kw 4, etc..., keyword phrase 10)
Any help much appreciated.
<?php
/**
 * Holds the scraped HTML and the meta keywords of a single URL.
 *
 * The page is fetched and parsed as soon as the object is constructed.
 */
class ResultPage
{
    /** URL passed to the constructor. */
    public $url;
    public $title;
    /** Raw HTML as returned by rseo_keywordSearch_scrapePage(). */
    public $html;
    public $plainText;
    /**
     * Words handed to rseo_keyword_getCountArray() by GetResults().
     * Nothing in this class fills it, so it defaults to an empty list
     * instead of null to keep GetResults() callable.
     */
    public $wordList = array();
    /** Phrases split out of the keywords meta tag. */
    public $keywords = array();

    /**
     * Stores the URL and immediately processes the page.
     *
     * @param string $siteurl Address of the page to analyse.
     */
    public function __construct($siteurl)
    {
        $this->url = $siteurl;
        $this->processPage();
    }

    /**
     * Fetches the page and fills $keywords from its keywords meta tag.
     *
     * Leaves $keywords untouched when the page cannot be parsed or has no
     * usable keywords meta tag.
     */
    public function processPage()
    {
        $this->html = rseo_keywordSearch_scrapePage($this->url);

        // str_get_html() is defined outside this file; guard against a falsy
        // result (presumably returned for empty or unparsable input) so the
        // find() call below cannot be made on a non-object.
        $dom = str_get_html($this->html);
        if (!$dom) {
            return;
        }

        $metakws = $dom->find('meta[name=keywords]');
        if (empty($metakws)) {
            return;
        }

        // Trim first so the outermost phrases carry no stray whitespace.
        $content = trim((string) $metakws[0]->content);
        if ($content === '') {
            return;
        }

        // EDIT HERE: split on commas (with optional surrounding whitespace),
        // dropping empty phrases produced by doubled or trailing commas.
        $this->keywords = preg_split('/[\s]*[,][\s]*/', $content, -1, PREG_SPLIT_NO_EMPTY);
    }

    /**
     * Returns the occurrence count of each entry in $wordList, most frequent first.
     *
     * @return array
     */
    public function GetResults()
    {
        return rseo_keyword_getCountArray($this->wordList);
    }
}
/**
 * Calls a remote web page using cURL and returns the raw HTML.
 *
 * @param string $url      Address of the page to fetch.
 * @param bool   $headonly Accepted for backward compatibility but currently
 *                         ignored: the CURLOPT_NOBODY option it would control
 *                         is not set.
 *
 * @return string The response with every "class=l" rewritten to "class='l'",
 *                or an empty string when the transfer fails.
 */
function rseo_keywordSearch_scrapePage($url, $headonly = TRUE)
{
    $agents = 'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.204 Safari/534.16';

    $ch = curl_init();
    if ($ch === false) {
        return '';
    }

    curl_setopt_array($ch, array(
        CURLOPT_HEADER         => 0,      // body only, no response headers in the output
        CURLOPT_VERBOSE        => FALSE,
        CURLOPT_RETURNTRANSFER => TRUE,   // return the response instead of printing it
        CURLOPT_FOLLOWLOCATION => TRUE,
        CURLOPT_USERAGENT      => $agents,
        CURLOPT_URL            => $url,
    ));

    $curlResp = curl_exec($ch);
    curl_close($ch);

    // curl_exec() yields false on failure; report that as an empty document
    // rather than feeding a boolean into str_replace().
    if ($curlResp === false) {
        return '';
    }

    // Quote the unquoted attribute value so it reads as class='l'.
    return str_replace("class=l", "class='l'", $curlResp);
}
/**
 * Counts how often each value occurs and orders the result by frequency.
 *
 * @param array|null $arr List of string/integer values to tally. Anything that
 *                        is not an array (e.g. a word list that was never
 *                        populated) is treated as empty.
 *
 * @return array Map of value => occurrence count, highest count first.
 */
function rseo_keyword_getCountArray($arr)
{
    // array_count_values() only accepts arrays; bail out early otherwise.
    if (!is_array($arr)) {
        return array();
    }

    $retarr = array_count_values($arr);
    arsort($retarr); // descending by count, keys preserved

    return $retarr;
}
This is a bit easier to do by matching rather than splitting, e.g.:
// Match, at the start of the string or right after a comma, up to three
// whitespace-separated words. The capture group always ends on a word, so a
// truncated cluster carries no trailing whitespace ("this is too", not "this is too ").
preg_match_all('/(?<=^|,)\s*([^\s,]+(?:\s+[^\s,]+){0,2})/', $metakw->content, $m);
// Keep only the first 10 keyword phrases.
$this->keywords = array_slice($m[1], 0, 10);
print_r($this->keywords);
/*
Array
(
[0] => this is too
[1] => this is not
[2] => keyword three
)
*/
Preg_split is not ideal for what you are trying to do.
I would try something like this:
// Number of keyword phrases appended so far.
$kept = 0;
foreach (explode(',', $metakw->content) as $keyword) {
    // Split on runs of whitespace and drop empty pieces, so the space that
    // follows a comma is not counted as a word.
    $words = preg_split('/\s+/', $keyword, -1, PREG_SPLIT_NO_EMPTY);
    if (!$words) {
        // Nothing between two commas (or whitespace only): skip it.
        continue;
    }
    // First 3 words out of a keyword cluster.
    $this->keywords[] = implode(' ', array_slice($words, 0, 3));
    // Stop at 10 keywords.
    if (++$kept === 10) {
        break;
    }
}
精彩评论