开发者

webscraper grabbing images, but not entering info into database

again. I'm having more issues with my script entering info into my database. The script below grabs a page, strips down the necessary info, then downloads the related image file. After that, it is supposed to enter the information gleaned from the URL into the database. For 开发者_如何学Pythonsome reason, the script seems to iterate through the URLs, as I get downloaded images for each URL, but each URL's product is not entered into the database. The script will insert the first product's categories and product info, and then it just stops, and continues to download images.

Any suggestions?

<?php

define('IN_PHPBB', true);
$phpbb_root_path = (defined('PHPBB_ROOT_PATH')) ? PHPBB_ROOT_PATH : './';
$phpEx = substr(strrchr(__FILE__, '.'), 1);
include($phpbb_root_path . 'common.' . $phpEx);
include($phpbb_root_path . 'includes/simple_html_dom.' . $phpEx);

// Start session management
$user->session_begin();
$auth->acl($user->data);
$user->setup();

set_time_limit(259200);

function save($in, $out)
{
    $ch = curl_init ($in);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_BINARYTRANSFER,1);
    $rawdata=curl_exec($ch);
    curl_close ($ch);
    if(file_exists($out))
    {
        unlink($out);
    }
    $fp = fopen($out,'x');
    fwrite($fp, $rawdata);
    fclose($fp);
}

function scrape($i)
{
    $url = 'http:/xxxxxxxx/index.php?main_page=product_info&products_id='.$i.'&zenid=e4b7dde8de02e1df005d4549e2e3e529';
    echo "$url -- ";
    $exists = file_get_contents($url);
    if ($exists != false)
    {
        $html = file_get_html($url); 

        foreach($html->find('body') as $html)
        {
            $test = $html->find('#productName', 0);
            if ($test)
            {
                $item['title'] = trim($html->find('#productName', 0)->plaintext);
                $item['price'] = trim($html->find('#productPrices', 0)->plaintext);
                $item['cat'] = $html->find('#navBreadCrumb', 0)->plaintext;
                list($home, $item['cat'], $item['subcat'], $title) = explode("::", $item['cat']);
                $item['cat'] = str_replace("&nbsp;", "", $item['cat']);
                $item['subcat'] = str_replace("\n", "", str_replace("&nbsp;", "", $item['subcat']));
                $item['desc'] = trim($html->find('#productDescription', 0)->plaintext);
                $item['model'] = $html->find('ul#productDetailsList', 0)->find('li', 0)->plaintext;
                $item['model'] = explode(":", $item['model']);
                $item['model'] = trim($item['model'][1]);
                $item['manufacturer'] = $html->find('ul#productDetailsList', 0)->find('li', 1)->plaintext;
                $item['manufacturer'] = explode(":", $item['manufacturer']);
                $item['manufacturer'] = trim($item['manufacturer'][1]);
                foreach($html->find('img') as $img)
                {
                    if($img->alt == $item['title'])
                    {
                        $item['img_sm'] = $img->src;
                    }
                }

                $ret[] = $item;

            }
        }
        $html->clear();
        unset($html);
        unset($item);
        return $ret;
    }
    else
    {
        echo "Could not find page<br />";
    }
    unset($exists);
}

$i = 1;
$end = 9999999;

while($i < $end)
{
    $ret = scrape($i);

    if(isset($ret))
    {
        foreach($ret as $v)
        {
            $item['title'] = $v['title'];
            $item['price'] = $v['price'];
            $item['desc'] = $v['desc'];
            $item['model'] = $v['model'];
            $item['manufacturer'] = $v['manufacturer'];
            $item['image'] = $v['image'];
            $item['cat'] = $v['cat'];
            $item['subcat'] = $v['subcat'];
            $item['img_sm'] = $v['img_sm'];
        }
        unset($ret);
        unset($v);

        $sm_img_src = "http://xxxxxx/".$item['img_sm'];
        $ext = strrchr($item['img_sm'], '.');

        $filename = $item['model'] . $ext;

        $lg_img_src = "http://xxxxx/images/STC/".$filename;
        $new_sm = "./rip_images/small/{$filename}";
        $new_lg = "./rip_images/large/{$filename}";

        $item['image'] = $filename;

        save($lg_img_src,$new_lg);
        save($sm_img_src,$new_sm);

        //see if parent cat exists
        $sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['cat']).'"';
        $result = $db->sql_query($sql);
        $parent = $db->sql_fetchrow($result);
        $db->sql_freeresult($result);
        // if not exists
        if($parent['cat_id'] == '')
        {
            //add the parent cat to the db
            $sql_ary = array(
                'cat_name' => $item['cat'],
                'cat_parent' => 0
            );
            $sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
            $db->sql_query($sql);
            $cat_id = $db->sql_nextid();

            //see if subcat exists
            $sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['subcat']).'"';
            $result = $db->sql_query($sql);
            $row = $db->sql_fetchrow($result);
            $db->sql_freeresult($result);
            // if not exists
            if($row['cat_id'] == '')
            {
                //add subcat to db
                $sql_ary = array(
                    'cat_name' => $db->sql_escape($item['subcat']),
                    'cat_parent' => $cat_id
                );
                $sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
                $db->sql_query($sql);
                $item_cat = $db->sql_nextid();
            }
            else //if exists
            {
                $item_cat = $row['cat_id'];
            }
        }
        else //if parent cat exists
        {
            //see if subcat exists
            $sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['subcat']).'"';
            $result = $db->sql_query($sql);
            $row = $db->sql_fetchrow($result);
            $db->sql_freeresult($result);
            // if not exists
            if($row['cat_id'] == '')
            {
                //add the subcat to the db
                $sql_ary = array(
                    'cat_name' => $db->sql_escape($item['subcat']),
                    'cat_parent' => $parent['cat_id']
                );
                $sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
                $db->sql_query($sql);
                $item_cat = $db->sql_nextid();
            }
            else //if exists
            {
                $item_cat = $row['cat_id'];
            }
        }

        $sql_ary = array(
            'item_title'      => $db->sql_escape($item['title']),
            'item_price'     => $db->sql_escape($item['price']),
            'item_desc'      => $db->sql_escape($item['desc']),
            'item_model'    => $db->sql_escape($item['model']),
            'item_manufacturer' => $db->sql_escape($item['manufacturer']),
            'item_image'    => $db->sql_escape($item['image']),
            'item_cat'      => $db->sql_escape($item_cat)
        );

        $sql = 'INSERT INTO ' . SHOP_ITEM_TABLE . ' ' . $db->sql_build_array('INSERT', $sql_ary);
        $db->sql_query($sql);
        garbage_collection();
        echo 'Done<br />';
    }
    $i++;
    unset($item);
}

?>


  1. Have you looked at the actual query strings generated by your DB library?
  2. Have you added any debugging to see if the queries succeeded? Most PHP db libraries return a boolean FALSE when a query call fails. You're blinding assuming the query succeeded.
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜