General internet "scraping" question
I just started studying programming about 6 months ago and I have really been diving deep into Objective-C. Unfortunately, I don't know any programmers IRL to bounce general questions off of.
What languages are being used when people write programs that search a website for information and then send it back? For example, if I wanted to write a program that would search weather.com for the daily temperature of the last 30 days in a given location and then send it back as, say, an NSArray or NSDictionary, how would I do that? Can I do that in Objective-C, or is that super-advanced scripting-language stuff? If I CAN do it in Objective-C, can someone link to a tutorial or place that may get me started learning that type of stuff? (I don't really know the term for this type of programming, so my Google searches have been unfruitful.)
I most commonly use PHP and MySQL with cURL.
http://en.wikipedia.org/wiki/CURL
You can do some fun things like Search Engine Results Page (SERP) queries, etc.
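Before the full crawler below, here is a bare-bones sketch of just the fetch step using PHP's cURL extension. The URL is only a placeholder; swap in whatever page you want to pull down.
<?php
// Download a single page with cURL and hand the HTML back as a string.
$ch = curl_init('http://www.example.com/some-page');  // placeholder URL
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the body instead of printing it
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow redirects
curl_setopt($ch, CURLOPT_TIMEOUT, 30);          // give up after 30 seconds
$html = curl_exec($ch);
if ($html === false) {
    die('cURL error: ' . curl_error($ch));
}
curl_close($ch);
// $html now holds the raw markup, ready for regexes or a DOM parser.
?>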
Here is the source from a crawler I use. I've cut out some parts for anonymity's sake, but it's a good almost-working example. I can help you get it running if need be.
<?php
class Crawler {

  protected $markup = '';
  protected $uri = '';
  protected $db_location = "localhost";
  protected $db_username = "***";
  protected $db_password = "***";
  protected $db_name     = "***";

  public function __construct() {
    // Crawls can chew through a lot of memory, so lift the limit.
    ini_set('memory_limit', -1);
  }

  // Fetch the raw HTML for $this->uri.
  public function getMarkup() {
    $markup = @file_get_contents($this->uri);
    return $markup;
  }

  // Dispatch get('links') to _get_links(), get('images') to _get_images(), etc.
  public function get($type) {
    $method = "_get_{$type}";
    if (method_exists($this, $method)) {
      return $this->$method();
    }
  }

  protected function db_query($query) {
    $connection = mysql_connect($this->db_location, $this->db_username, $this->db_password) or die(mysql_error());
    mysql_select_db($this->db_name, $connection) or die(mysql_error()." >> ".$query);
    //echo $query."<br/>"; //for debugging
    $result = mysql_query($query, $connection) or die(mysql_error()." >> ".$query);
    // SELECT/SHOW queries hand back a result resource; INSERT/UPDATE just return TRUE.
    if ($result !== true) {
      $tableArray = array();
      $i = 0;
      while ($data_array = mysql_fetch_array($result)) {
        foreach ($data_array as $key => $value) {
          $tableArray[$i][$key] = stripslashes($data_array[$key]);
        }
        $i++;
      }
      return $tableArray;
    }
  }

  // Insert $array into $table, matching values to the table's column order.
  protected function db_insert($table, $array) {
    $tableArray = $this->db_query("show columns from ".$table);
    $inputString = "";
    foreach ($tableArray as $key => $value) {
      if (array_key_exists($value[0], $array) && $value[0]) {
        $inputString .= "'".addslashes($array[$value[0]])."', ";
      } else {
        $inputString .= "'', ";
      }
    }
    $inputString = substr($inputString, 0, -2); // drop the trailing ", "
    $this->db_query("insert into $table values(".$inputString.")");
    return mysql_insert_id();
  }

  // Pull everything we care about off the current page into one array.
  protected function _get_data() {
    //$scrape['id'] = $this->get('id');
    $scrape['name']        = $this->get('name');
    $scrape['tags']        = $this->get('tags');
    $scrape['stat_keys']   = $this->get('stat_keys');
    $scrape['stat_values'] = $this->get('stat_values');

    // Throw away values that are really page chrome, not stats.
    foreach ($scrape['stat_values'] as $key => $value) {
      $scrape['stat_values'][$key] = trim($scrape['stat_values'][$key]);
      if (strpos($value, "<h5>Featured Product</h5>") !== false) {
        unset($scrape['stat_values'][$key]);
      }
      if (strpos($value, "<h5>Featured Company</h5>") !== false) {
        unset($scrape['stat_values'][$key]);
      }
      if (strpos($value, "<h5>Featured Type</h5>") !== false) {
        unset($scrape['stat_values'][$key]);
      }
      if (strpos($value, "sign in") !== false) {
        unset($scrape['stat_values'][$key]);
      }
      if (strpos($value, "/100") !== false) {
        unset($scrape['stat_values'][$key]);
      }
    }

    // Count how many times each tag shows up, then shift the counts down by one.
    if (is_array($scrape['tags']) && count($scrape['tags']) > 0) {
      $tag_array = array();
      foreach ($scrape['tags'] as $tag) {
        $tag_array[$tag] = isset($tag_array[$tag]) ? $tag_array[$tag] + 1 : 1;
      }
      $scrape['tags'] = $tag_array;
      foreach ($scrape['tags'] as $key => $tag_count) {
        $scrape['tags'][$key] = $tag_count - 1;
      }
    }

    // Re-index stat_values after the unset()s above.
    $scrape['stat_values'] = array_merge(array(), $scrape['stat_values']);
    return $scrape;
  }

  protected function _get_images() {
    if (!empty($this->markup)) {
      preg_match_all('/<img([^>]+)\/>/i', $this->markup, $images);
      return !empty($images[1]) ? $images[1] : FALSE;
    }
  }

  protected function _get_links() {
    if (!empty($this->markup)) {
      preg_match_all('/<a([^>]+)\>(.*?)\<\/a\>/i', $this->markup, $links);
      return !empty($links[1]) ? $links[1] : FALSE;
    }
  }

  protected function _get_id() {
    if (!empty($this->markup)) {
      preg_match_all('/\/wine\/view\/([^`]*?)-/', $this->markup, $links);
      return !empty($links[1]) ? $links[1] : FALSE;
    }
  }

  protected function _get_grape() {
    if (!empty($this->markup)) {
      preg_match_all('/ class="linked" style="font-size: 14px;">([^`]*?)<\/a>/', $this->markup, $links);
      return !empty($links[1]) ? $links[1] : FALSE;
    }
  }
}

// Kick off a crawl by requesting the script with ?pass=go.
if (isset($_GET['pass']) && $_GET['pass'] == "go") {
  $crawl = new Crawler();
  $crawl->go(); // go() is among the pieces cut out above
}
?>
So, you want to know how to write server-side code? Well, in theory you can write that in whatever language you want. I also assure you it isn't "super-advanced".
You might find it easiest to get started with PHP. W3schools.com has a fine tutorial.
What you are describing is a crawler (e.g., what Google uses to index the web).
Any language that has the ability to send HTTP requests and receive responses can do this (which is most languages).
If you don't care to code this thing from scratch, try downloading an open source crawler framework that will allow for custom plugins to parse the resulting HTML.
For your example, you would tell the crawler what site you want it to crawl (i.e. your weather site), add URI constraints if necessary, and create a custom plugin to parse the weather data out of the HTML it responds with. You can then save that data however you see fit.
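To make that concrete with the weather example, here is a rough PHP sketch. The URL and the regular expression are placeholders (the real markup on weather.com will differ), so you would adjust the pattern, or use a proper DOM parser, after looking at the actual page source.
<?php
// Fetch a page and pull out anything that looks like a Fahrenheit temperature.
function get_daily_temps($url) {
    $html = @file_get_contents($url); // simplest possible fetch; cURL works too
    if ($html === false) {
        return array();
    }
    // Hypothetical pattern: matches strings like "72°F" or "-5 °F".
    preg_match_all('/(-?\d{1,3})\s*°\s*F/u', $html, $matches);
    return $matches[1]; // plain array of temperature strings
}

$temps = get_daily_temps('http://www.weather.com/some-location-page'); // placeholder URL
print_r($temps);
?>
A common way to bridge this back to Objective-C is to have the PHP script echo json_encode($temps), request that URL from your app, and turn the JSON response into an NSArray/NSDictionary with a JSON parser on the device.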