python: get google adsense earnings report

2023-02-19 17:23 问答作者：

I need a python script that gets the google adsense earnings and I found adsense scraper: http://pypi.python.org/pypi/adsense_scraper/0.5 It uses Twill and html5lib to scrape google adsense earnings data. When I use it I get this error message:

Traceback (most recent call last):
  File "adsense_scraper.py", line 163, in <module>
    data = main()
  File "adsense_scraper.py", line 154, in main
    b = get_adsense(login, password)
  File "adsense_scraper.py", line 128, in get_adsense
    b.submit()
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 467, in submit
    self._journey('open', request)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\browser.py", line 523, in _journey
    r = func(*args, **kwargs)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
    return self._mech_open(url, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
    response = UserAgentBase.open(self, request, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
    response = meth(req, response)
  File "c:\python26\lib\site-packages\twi开发者_运维知识库ll-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 590, in http_response
   "http", request, response, code, msg, hdrs)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
    result = apply(self._call_chain, args)
  File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
    result = func(*args)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
    return self.parent.open(new)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
    return self._mech_open(url, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
    response = UserAgentBase.open(self, request, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 192, in open
    response = meth(req, response)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\utils.py", line 442, in http_response
    "refresh", msg, hdrs)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 209, in error
    result = apply(self._call_chain, args)
  File "C:\Python26\lib\urllib2.py", line 361, in _call_chain
    result = func(*args)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_http.py", line 135, in http_error_302
    return self.parent.open(new)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 212, in open
    return self._mech_open(url, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_mechanize.py", line 238, in _mech_open
    response = UserAgentBase.open(self, request, data)
  File "c:\python26\lib\site-packages\twill-0.9-py2.6.egg\twill\other_packages\_mechanize_dist\_opener.py", line 181, in open
    response = urlopen(self, req, data)
  File "C:\Python26\lib\urllib2.py", line 406, in _open 'unknown_open', req)
  File "C:\Python26\lib\urllib2.py", line 361, in _call_chain result = func(*args)
  File "C:\Python26\lib\urllib2.py", line 1163, in unknown_open raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: 'http>

So the important thing is:

urllib2.URLError: <urlopen error unknown url type: 'http>

Can somebody tell me where the error is? Is there even a better way to get the data via python? Thanks

there are several errors with the package, you mentioned only the first one

1) twill package does not handle google's redirects correctly, adding

    newurl = newurl.strip( "'" )

to twill/other_packages/_mechanize_dist/_http.py:108 before

    newurl = _rfc3986.clean_url(newurl, "latin-1")

fixes that

2) you have to have the correct language set in adsense - English

3) there are several problems in the orignal adsense_scraper

#!/usr/bin/env python
"""Scrapes Google AdSense data with Python using Twill

Current canonical location of this module is here:
http://github.com/etrepum/adsense_scraper/tree/master


Usage::

    from adsense_scraper import get_adsense, get_time_period
    b = get_adsense('YOUR_ADSENSE_LOGIN', 'YOUR_ADSENSE_PASSWORD')
    rows = get_time_period(b, 'yesterday')
    # The summary data is always the first row with channel == ''
    print 'I earned this much yesterday: $%(earnings)s' % rows[0]

"""
# requires html5lib, twill
import sys
import pprint
import decimal
from cStringIO import StringIO
from xml.etree import cElementTree

try:
    from html5lib import HTMLParser
    import twill.commands
except ImportError:
    print >>sys.stderr, """\
adsense_scraper has dependencies::

    Twill 0.9 http://twill.idyll.org/
    html5lib 0.11 http://code.google.com/p/html5lib/

Try this::

    $ easy_install twill html5lib
"""
    raise SystemExit()

__version__ = '0.5'

SERVICE_LOGIN_BOX_URL = "https://www.google.com/accounts/ServiceLogin?service=adsense&rm=hide&fpui=3&nui=15&alwf=true&ltmpl=adsense&passive=true&continue=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&followup=https%3A%2F%2Fwww.google.com%2Fadsense%2Fgaiaauth2&hl=en_US"
OVERVIEW_URL = "https://www.google.com/adsense/report/overview?timePeriod="

TIME_PERIODS = [
    'today',
    'yesterday',
    'thismonth',
    'lastmonth',
    'sincelastpayment',
]


def parse_decimal(s):
    """Return an int or decimal.Decimal given a human-readable number

    """
    light_stripped = s.strip(u'\u20ac')
    stripped = light_stripped.replace(',', '.').rstrip('%').lstrip('$')
    try:
        int(stripped)
        return light_stripped
    except ValueError:
        pass
    try:
        float(stripped)
        return light_stripped
    except ValueError:
        return decimal.Decimal(stripped)


def parse_summary_table(doc):
    """
    Parse the etree doc for summarytable, returns::

        [{'channel': unicode,
          'impressions': int,
          'clicks': int,
          'ctr': decimal.Decimal,
          'ecpm': decimal.Decimal,
          'earnings': decimal.Decimal}]

    """
    for t in doc.findall('.//table'):
        if t.attrib.get('id') == 'summarytable':
            break
    else:
        raise ValueError("summary table not found")

    res = []
    FIELDS = ['impressions', 'clicks', 'ctr', 'ecpm', 'earnings']
    for row in t.findall('.//tr'):
        celltext = []
        for c in row.findall('td'):
            tail = ''
            # adsense inserts an empty span if a row has a period in it, so
            # get the children and find the tail element to append to the text
            if c.find('a') and c.find('a').getchildren():
                tail = c.find('a').getchildren()[0].tail or ''
            celltext.append('%s%s' % ((c.text or c.findtext('a') or '').strip(), tail.strip()))

        celltext = filter( lambda x: x != "" , celltext )
        if len(celltext) != len(FIELDS):
            continue
        try:
            value_cols = map(parse_decimal, celltext)
        except decimal.InvalidOperation:
            continue
        res.append(dict(zip(FIELDS, value_cols)))

    return res


def get_adsense(login, password):
    """Returns a twill browser instance after having logged in to AdSense
    with *login* and *password*.

    The returned browser will have all of the appropriate cookies set but may
    not be at the exact page that you want data from.

    """
    b = twill.commands.get_browser()
    b.go(SERVICE_LOGIN_BOX_URL)
    for form in b.get_all_forms():
        try:
            form['Email'] = login
            form['Passwd'] = password
        except ValueError:
            continue
        else:
            break
    else:
        raise ValueError("Could not find login form on page")
    b._browser.select_form(predicate=lambda f: f is form)
    b.submit()
    return b


def get_time_period(b, period):
    """Returns the parsed summarytable for the time period *period* given
    *b* which should be the result of a get_adsense call. *period* must be
    a time period that AdSense supports:
    ``'today'``, ``'yesterday'``, ``'thismonth'``,
    ``'lastmonth'``, ``'sincelastpayment'``.

    """
    b.go(OVERVIEW_URL + period)
    # The cElementTree treebuilder doesn't work reliably enough
    # to use directly, so we parse and then dump into cElementTree.
    doc = cElementTree.fromstring(HTMLParser().parse(b.get_html()).toxml())
    return parse_summary_table(doc)


def main():
    try:
        login, password = sys.argv[1:]
    except ValueError:
        raise SystemExit("usage: %s LOGIN PASSWORD" % (sys.argv[0],))
    twill.set_output(StringIO())
    twill.commands.reset_browser()
    b = get_adsense(login, password)
    data = {}
    for period in TIME_PERIODS:
        data[period] = get_time_period(b, period)
    pprint.pprint(data)
    twill.set_output(None)
    return data

if __name__ == '__main__':
    data = main()

继续阅读：html5lib python twill

python: get google adsense earnings report

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？