开发者

PHP Script Memory Leak

I understand this isn't exactly the smallest code, i've tried to cut it down as much as i could. The script just consumes more and more memory until it finally runs out. I've used unset() where possible but it doesn't seem to have any effect. It always seems to error in the MultiGet function but i'm not sure if that is where the leak is. Any input would be greatly appreciated.

public function Test()
{
    $base = dirname(__FILE__) .'/';
    $prod_file = $base.'products.dbf';

    $this->dbf->load($prod_file);
    $num_rec=$ci->dbf->dbf_num_rec;

    $buffer = Array();
    for($i=0;$i<$num_rec;$i++):
        $row = $ci->dbf->getRowAssoc($i);

        $info = Array('part_number' => $row['PART_NUM'],
                      'td_group_id' => $row['GRP'],
                      'name' => 'DESCR');

        $this->db->where('td_group_id',$info['td_group_id']);
        $result = $this->db->get('tbl_categories')->row_array();
        if(isset($result['id'])):
            $info['category_id'] = $result['id'];
            $buffer[]  = $info;
        endif;

        if(count($buffer) == 100 || $i == $num_rec -1):
            $url_buffer = Array();
            foreach($buffer as $row):
                $url_buffer[] = $this->_product_url($row['part_number']);
            endforeach;

            $html_returns = $this->MultiCrawl($url_buffer);
            foreach($html_returns as $url_index=>$html):
                $more_info = $this->_extract_more_info($html);
                if($more_info):
                    $more_info['category_id'] = $buffer[$url_index]['category_id'];
                    $more_info['td_part_number'] = $buffer[$url_index]['part_number'];
                    $this->_parse_product($more_info);
                endif;
            endforeach;
            $buffer = Array();
        endif;

    endfor;



}


function MultiGet($all_urls)
{

    $useragent = $this->_useragent;
    $cookie_file = $this->_cookie_file;

    $url_index = $this->UrlIndex($all_urls);

    $return_buffer = Array();

    $mh = curl_mu开发者_JAVA百科lti_init();

    $ch = Array();
    $max_connections = 15;
    $index = 0;
    $open_connections = 0;
    $execReturnValue = true;
    $running = true;
    $max_index = count($all_urls)-1;
    $url_count = count($all_urls);
    $buffer_count = 0;

    while ($buffer_count < $url_count){

        if($open_connections < $max_connections && $index <= $max_index):
            for($i=$open_connections;$i<$max_connections && $index <= $max_index;$i++):
                $url = $all_urls[$index];
                $ch[$index] = curl_init($url);
                curl_setopt($ch[$index],CURLOPT_FOLLOWLOCATION, true);
                curl_setopt($ch[$index],CURLOPT_RETURNTRANSFER, true);
                curl_setopt($ch[$index],CURLOPT_COOKIESESSION, false);
                curl_setopt($ch[$index],CURLOPT_SSL_VERIFYHOST , false);
                curl_setopt($ch[$index],CURLOPT_SSL_VERIFYPEER , false);
                curl_setopt($ch[$index],CURLOPT_COOKIEJAR, $cookie_file);
                curl_setopt($ch[$index],CURLOPT_COOKIEFILE, $cookie_file);
                curl_setopt($ch[$index],CURLOPT_USERAGENT,$useragent);
                curl_multi_add_handle($mh, $ch[$index]);
                $open_connections++;
                $index++;
                $execReturnValue = curl_multi_exec($mh,$running);
                usleep(200);
            endfor;
        endif;

        $execReturnValue = curl_multi_exec($mh,$running);
        $ready=curl_multi_select($mh);


        while($info=curl_multi_info_read($mh)){
            $status=curl_getinfo($info['handle'],CURLINFO_HTTP_CODE);
            if($status==200){
                $successUrl=curl_getinfo($info['handle'],CURLINFO_EFFECTIVE_URL);
                $curl_index = $url_index[$successUrl];
                $return_buffer[$curl_index] = curl_multi_getcontent($ch[$curl_index]);
                $buffer_count = count($return_buffer);
                curl_multi_remove_handle($mh, $ch[$curl_index]);
                curl_close($ch[$curl_index]);
                unset($ch[$curl_index]);
                $open_connections--;
            }else{

                echo "ERROR: $status\n";
            }
        }
    } 

    curl_multi_close($mh);
    unset($mh);

    return $return_buffer;
}



private function _extract_more_info($html)
{

    $buffer = array();


    $query = "//img[@id='ctl00_cphMain_cntrlProductProfile_imgprodimage']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['td_img_url'] = $node?trim($node->getAttribute('src')):null;
    unset($result);


    $query = "//span[@class='priceLarge']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['price'] = $node?trim($node->nodeValue):null;
    if($buffer['price'] == 'Req. Auth.') return null;
    unset($result);


    $query = "//span[@id='ctl00_cphMain_cntrlProductProfile_newLtFinalPrice']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['msrp'] = $node?trim($node->nodeValue):null;
    unset($result);


    $query = "//span[@id='ctl00_cphMain_cntrlProductProfile_newLTMRF']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['manf_part_number'] = $node?trim($node->nodeValue):null;
    unset($result);


    $query = "//span[@id='ctl00_cphMain_cntrlProductProfile_newLblUPC']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    $buffer['upc_part_number'] = $node?trim($node->nodeValue):null;
    unset($result);


    $query = "//td[@class='black_text_WUL']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['manufacturer'] = $node?trim($node->nodeValue):null;
    unset($result);


    $query = "//td[@class='textt' and @colspan='3']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['short_description'] = $node?trim($node->nodeValue):null;
    unset($result);





    $query = "//div[@id='ctl00_cphMain_pnlMarketingDesc']//td[@class='textt']";
    $result = $this->_xquery($html,$query);
    $node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    if(!$node) return null;
    $buffer['long_description'] = $node?trim($node->nodeValue):null;
    unset($result);

    $query = "//table[@id='ctl00_cphMain_cntrlMainSpecs_dgSpecs']";
    $result = $this->_xquery($html,$query);
    $table = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    unset($result);

    if(!$table) return null;
    $table_array = Array();
    $rows = $table->getElementsByTagName('tr');
    foreach($rows as $tr):
        $temp = Array();
        $columns = $tr->getElementsByTagName('td');
        $caption = $columns->length > 0 && $columns->length <= 2 ? trim($columns->item(0)->nodeValue) : null;
        $value = $columns->length == 2 ? trim($columns->item(1)->nodeValue) : null;

        if ($caption) $table_array[$caption] = $value;
    endforeach;


    $buffer['main_specs']=$table_array;


    $query = "//table[@id='ctl00_cphMain_cntrlExtSpecs_tblData']";
    $result = $this->_xquery($html,$query);
    $table = $result instanceof DOMNode?$this->_to_dom_node($result):null;
    unset($result);
    $buffer['additional_specs'] = null;
    if(!$table) return $buffer;


    $table_array = Array();
    $rows = $table->getElementsByTagName('tr');
    foreach($rows as $tr):
        $temp = Array();
        $columns = $tr->getElementsByTagName('td');
        $caption = $columns->length > 0 && $columns->length <= 2 ? trim($columns->item(0)->nodeValue) : null;
        $value = $columns->length == 2 ? trim($columns->item(1)->nodeValue) : null;

        if ($caption) $table_array[$caption] = $value;
    endforeach;
    $buffer['additional_specs']=$table_array;;
    return $buffer;

}



private function _xquery($html,$query,$allnodes = false){
    $src = '';
    $dom = new DOMDocument();
    $node = null;
    if (@$dom->loadHTML($html)) {
        $xpath = new DOMXpath($dom);
        $nodeList = $xpath->query($query);
        if ($nodeList->length > 0) {
            $node = $allnodes==false?$nodeList->item(0):$nodeList;
        }
    }
    unset($xpath);
    unset($nodeList);
    unset($dom);
    return $node;
}


Strategies to find a leak?

  • make sure it is a leak (if processing 1/100 of the data, is memory still not freed? 1/1000?)
  • think about complexity: if foo is O(n), bar is O(n) and bar calls foo, the result may become O(n*n).
  • experiment: disable parts of the program until it leaks no more

At first sight, you're crawling a series of url's. These may contain more url's, to be crawled using the MultiCrawl method. Are you sure there can't be a cycle in there? (working with folders has tricked me more than once: browsing '.' as a subfolder yields infinite loops)

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜