webscraper grabbing images, but not entering info into database
again. I'm having more issues with my script entering info into my database. The script below grabs a page, strips down the necessary info, then downloads the related image file. After that, it is supposed to enter the information gleaned from the URL into the database. For 开发者_如何学Pythonsome reason, the script seems to iterate through the URLs, as I get downloaded images for each URL, but each URL's product is not entered into the database. The script will insert the first product's categories and product info, and then it just stops, and continues to download images.
Any suggestions?
<?php
define('IN_PHPBB', true);
$phpbb_root_path = (defined('PHPBB_ROOT_PATH')) ? PHPBB_ROOT_PATH : './';
$phpEx = substr(strrchr(__FILE__, '.'), 1);
include($phpbb_root_path . 'common.' . $phpEx);
include($phpbb_root_path . 'includes/simple_html_dom.' . $phpEx);
// Start session management
$user->session_begin();
$auth->acl($user->data);
$user->setup();
set_time_limit(259200);
function save($in, $out)
{
$ch = curl_init ($in);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_BINARYTRANSFER,1);
$rawdata=curl_exec($ch);
curl_close ($ch);
if(file_exists($out))
{
unlink($out);
}
$fp = fopen($out,'x');
fwrite($fp, $rawdata);
fclose($fp);
}
function scrape($i)
{
$url = 'http:/xxxxxxxx/index.php?main_page=product_info&products_id='.$i.'&zenid=e4b7dde8de02e1df005d4549e2e3e529';
echo "$url -- ";
$exists = file_get_contents($url);
if ($exists != false)
{
$html = file_get_html($url);
foreach($html->find('body') as $html)
{
$test = $html->find('#productName', 0);
if ($test)
{
$item['title'] = trim($html->find('#productName', 0)->plaintext);
$item['price'] = trim($html->find('#productPrices', 0)->plaintext);
$item['cat'] = $html->find('#navBreadCrumb', 0)->plaintext;
list($home, $item['cat'], $item['subcat'], $title) = explode("::", $item['cat']);
$item['cat'] = str_replace(" ", "", $item['cat']);
$item['subcat'] = str_replace("\n", "", str_replace(" ", "", $item['subcat']));
$item['desc'] = trim($html->find('#productDescription', 0)->plaintext);
$item['model'] = $html->find('ul#productDetailsList', 0)->find('li', 0)->plaintext;
$item['model'] = explode(":", $item['model']);
$item['model'] = trim($item['model'][1]);
$item['manufacturer'] = $html->find('ul#productDetailsList', 0)->find('li', 1)->plaintext;
$item['manufacturer'] = explode(":", $item['manufacturer']);
$item['manufacturer'] = trim($item['manufacturer'][1]);
foreach($html->find('img') as $img)
{
if($img->alt == $item['title'])
{
$item['img_sm'] = $img->src;
}
}
$ret[] = $item;
}
}
$html->clear();
unset($html);
unset($item);
return $ret;
}
else
{
echo "Could not find page<br />";
}
unset($exists);
}
$i = 1;
$end = 9999999;
while($i < $end)
{
$ret = scrape($i);
if(isset($ret))
{
foreach($ret as $v)
{
$item['title'] = $v['title'];
$item['price'] = $v['price'];
$item['desc'] = $v['desc'];
$item['model'] = $v['model'];
$item['manufacturer'] = $v['manufacturer'];
$item['image'] = $v['image'];
$item['cat'] = $v['cat'];
$item['subcat'] = $v['subcat'];
$item['img_sm'] = $v['img_sm'];
}
unset($ret);
unset($v);
$sm_img_src = "http://xxxxxx/".$item['img_sm'];
$ext = strrchr($item['img_sm'], '.');
$filename = $item['model'] . $ext;
$lg_img_src = "http://xxxxx/images/STC/".$filename;
$new_sm = "./rip_images/small/{$filename}";
$new_lg = "./rip_images/large/{$filename}";
$item['image'] = $filename;
save($lg_img_src,$new_lg);
save($sm_img_src,$new_sm);
//see if parent cat exists
$sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['cat']).'"';
$result = $db->sql_query($sql);
$parent = $db->sql_fetchrow($result);
$db->sql_freeresult($result);
// if not exists
if($parent['cat_id'] == '')
{
//add the parent cat to the db
$sql_ary = array(
'cat_name' => $item['cat'],
'cat_parent' => 0
);
$sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
$db->sql_query($sql);
$cat_id = $db->sql_nextid();
//see if subcat exists
$sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['subcat']).'"';
$result = $db->sql_query($sql);
$row = $db->sql_fetchrow($result);
$db->sql_freeresult($result);
// if not exists
if($row['cat_id'] == '')
{
//add subcat to db
$sql_ary = array(
'cat_name' => $db->sql_escape($item['subcat']),
'cat_parent' => $cat_id
);
$sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
$db->sql_query($sql);
$item_cat = $db->sql_nextid();
}
else //if exists
{
$item_cat = $row['cat_id'];
}
}
else //if parent cat exists
{
//see if subcat exists
$sql = 'SELECT cat_id FROM ' . SHOP_CAT_TABLE . ' WHERE cat_name = "'.$db->sql_escape($item['subcat']).'"';
$result = $db->sql_query($sql);
$row = $db->sql_fetchrow($result);
$db->sql_freeresult($result);
// if not exists
if($row['cat_id'] == '')
{
//add the subcat to the db
$sql_ary = array(
'cat_name' => $db->sql_escape($item['subcat']),
'cat_parent' => $parent['cat_id']
);
$sql = 'INSERT INTO '.SHOP_CAT_TABLE.' '.$db->sql_build_array('INSERT', $sql_ary);
$db->sql_query($sql);
$item_cat = $db->sql_nextid();
}
else //if exists
{
$item_cat = $row['cat_id'];
}
}
$sql_ary = array(
'item_title' => $db->sql_escape($item['title']),
'item_price' => $db->sql_escape($item['price']),
'item_desc' => $db->sql_escape($item['desc']),
'item_model' => $db->sql_escape($item['model']),
'item_manufacturer' => $db->sql_escape($item['manufacturer']),
'item_image' => $db->sql_escape($item['image']),
'item_cat' => $db->sql_escape($item_cat)
);
$sql = 'INSERT INTO ' . SHOP_ITEM_TABLE . ' ' . $db->sql_build_array('INSERT', $sql_ary);
$db->sql_query($sql);
garbage_collection();
echo 'Done<br />';
}
$i++;
unset($item);
}
?>
- Have you looked at the actual query strings generated by your DB library?
- Have you added any debugging to see if the queries succeeded? Most PHP db libraries return a boolean FALSE when a query call fails. You're blinding assuming the query succeeded.
精彩评论