How to set name for download images

2022-12-07 22:37 问答作者：

I think I need help with a for loop. I have a list of 33 images. For each image I need to create a filename made of: name, reference and number. Every image of a product with the same reference number has to end with a number from 1 to x.

*Like here:

Can someone give me a hint on how to finish the code? Right now the images of products with the same reference number are overwritten in the output, because they all get the same filename.

import requests
from bs4 import BeautifulSoup


productlinks = []

# range(1, 2) yields only 1, so just the first listing page is fetched.
for page in range(1, 2):
    listing_url = (
        'https://www.roco.cc/ren/products/locomotives/steam-locomotives.html'
        f'?p={page}&verfuegbarkeit_status=41%2C42%2C43%2C45%2C44'
    )
    r = requests.get(listing_url)
    soup = BeautifulSoup(r.content, 'lxml')

    # Every product tile is an <li>; its title anchor holds the detail URL.
    for tile in soup.find_all('li', class_='item product product-item'):
        anchors = tile.find_all('a', class_='product-item-link', href=True)
        for link in anchors:
            productlinks.append(link['href'])


wayslist = []

# Download every image whose URL contains 'def' from each product page.
for url in productlinks:
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'html.parser')

    name = 'Roco'

    # The article number is the same for all images of a page, so it is
    # looked up once per page instead of once per image.
    reference_tag = soup.find('span', class_='product-head-artNr')
    if reference_tag is None:
        # Report the page that has no article number and skip it, rather
        # than reusing the reference of the previous page.
        print(url)
        continue
    reference = reference_tag.get_text().strip()

    for image in soup.find_all('img'):
        # .get avoids a KeyError on <img> tags without a src attribute.
        ways = image.get('src', '')
        if 'def' not in ways:
            continue

        wayslist.append(ways)

        # NOTE: the file name has no per-image suffix, so all images of one
        # product are written to the same file and only the last survives.
        with open(name + '_' + reference + '_' + '.jpg', 'wb') as f:
            im = requests.get(ways)
            f.write(im.content)
            print('Writing: ', ways)


print(len(wayslist))

You can use enumerate:

images = soup.findAll('img')

# Filter first, then number only the images that are actually saved.
# Enumerating `images` directly would also count the <img> tags that the
# 'def' filter skips, so the numbering would not start at 1.
product_images = [image for image in images if 'def' in image.get('src', '')]

for i, image in enumerate(product_images, start=1):
    name = 'Roco'

    try:
        reference = soup.find(
            'span', class_='product-head-artNr').get_text().strip()
    except AttributeError:
        # soup.find returned None: the page has no article number.
        # `url` is the page URL from the enclosing loop in the question.
        print(url)
        continue

    ways = image['src']

    wayslist.append(ways)

    with open(name + '_' + reference + '_' + str(i) + '.jpg', 'wb') as f:
        im = requests.get(ways)
        f.write(im.content)
        print('Writing: ', ways)

In the output, the number at the end of every filename (just before .jpg) starts at 4 instead of 1:

productlinks = []

# range(1, 2) yields only 1, so just the first listing page is fetched.
for page in range(1, 2):
    listing_url = (
        'https://www.roco.cc/ren/products/locomotives/steam-locomotives.html'
        f'?p={page}&verfuegbarkeit_status=41%2C42%2C43%2C45%2C44'
    )
    r = requests.get(listing_url)
    soup = BeautifulSoup(r.content, 'lxml')

    # Every product tile is an <li>; its title anchor holds the detail URL.
    for tile in soup.find_all('li', class_='item product product-item'):
        anchors = tile.find_all('a', class_='product-item-link', href=True)
        for link in anchors:
            productlinks.append(link['href'])


wayslist = []

# Download every image whose URL contains 'def' from each product page.
for url in productlinks:
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'html.parser')

    name = 'Roco'

    # The article number is the same for all images of a page, so it is
    # looked up once per page instead of once per image.
    reference_tag = soup.find('span', class_='product-head-artNr')
    if reference_tag is None:
        # Report the page that has no article number and skip it, rather
        # than reusing the reference of the previous page.
        print(url)
        continue
    reference = reference_tag.get_text().strip()

    # NOTE: i is the position among *all* <img> tags of the page, including
    # the ones the 'def' filter skips, so the suffix i + 1 is not guaranteed
    # to start at 1 for the first saved image.
    for i, image in enumerate(soup.find_all('img')):
        # .get avoids a KeyError on <img> tags without a src attribute.
        ways = image.get('src', '')
        if 'def' not in ways:
            continue

        wayslist.append(ways)

        with open(name + '_' + reference + '_' + str(i + 1) + '.jpg', 'wb') as f:
            im = requests.get(ways)
            f.write(im.content)
            print('Writing: ', ways)


print(len(wayslist))

How to set name for download images

Now I'm almost done (please don't laugh about the size of the code :)). Can someone help me with one thing: the main Excel file is now saved in the folder that should contain only images:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import xlsxwriter
import os

# Site root. NOTE(review): not referenced by any of the code visible below.
baseurl = 'https://www.roco.cc/'

# Browser-like HTTP request headers.
# NOTE(review): none of the requests.get calls visible below pass this dict,
# so as written these headers are never sent.
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}

productlinks = []

# range(1, 2) yields only 1, so just the first listing page is fetched.
for page in range(1, 2):
    listing_url = (
        'https://www.roco.cc/ren/products/locomotives/steam-locomotives.html'
        f'?p={page}&verfuegbarkeit_status=41%2C42%2C43%2C45%2C44'
    )
    r = requests.get(listing_url)
    soup = BeautifulSoup(r.content, 'lxml')

    # Every product tile is an <li>; its title anchor holds the detail URL.
    for tile in soup.find_all('li', class_='item product product-item'):
        anchors = tile.find_all('a', class_='product-item-link', href=True)
        for link in anchors:
            productlinks.append(link['href'])


loco_list = []

# Manufacturer written into every row of the 'Model' sheet.
MANUFACTURER_NAME = 'Roco'

# (output column, value of the cell's ``data-th`` attribute) for every
# spec-table field that is copied verbatim, in output column order.
# The double spaces in the two 'Number of  ...' labels are intentional: they
# reproduce the original lookup strings exactly.
SPEC_COLUMNS = (
    ('Control', 'Control'),
    ('Interface', 'Interface'),
    ('Digital_decoder', 'Digital decoder'),
    ('Decoder_Type', 'Decoder-Type'),
    ('Motor', 'Motor'),
    ('Flywheel', 'Flywheel'),
    ('Minimum_radius', 'Minimum radius'),
    ('Length_over_buffer', 'Length over buffer'),
    ('Number_of_driven_axles', 'Number of  driven axles'),
    ('Number_of_axles_with_traction_tyres',
     'Number of  axles with traction tyres'),
    ('Coupling', 'Coupling'),
    ('LED_lighting', 'LED lighting'),
    ('Head_light', 'Head light'),
    ('LED_head_light', 'LED head light'),
    ('Country', 'Original (country)'),
    ('Railway_company', 'Railway Company'),
    ('Epoch', 'Epoch'),
)


def _text_or_blank(node):
    """Return the stripped text of *node*, or '' when *node* is None."""
    return node.text.strip() if node is not None else ''


def _spec(soup, label):
    """Return the text of the <td> whose ``data-th`` equals *label*, or ''."""
    return _text_or_blank(soup.find('td', {'data-th': label}))


# Build one row per product page.
for link in productlinks:
    r = requests.get(link, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'lxml')

    reference_tag = soup.find('span', class_='product-head-artNr')
    if reference_tag is None:
        # Report the page without an article number; the row is still
        # written, with a blank reference instead of None.
        print(link)

    # The product title is the <h1> inside the 'product-head-name' div.
    heading = soup.find('div', class_='product-head-name')
    title_tag = heading.h1 if heading is not None else None

    # 'Current' is the first space-separated token of the raw 'Control' cell
    # text (deliberately not stripped first, as in the original lookup).
    control_cell = soup.find('td', {'data-th': 'Control'})
    current = control_cell.text.split(' ')[0] if control_cell is not None else ''

    row = {
        'Manufacturer_name': MANUFACTURER_NAME,
        'Reference': _text_or_blank(reference_tag),
        'Price': _text_or_blank(soup.find('div', class_='product-head-price')),
        'Type': _text_or_blank(title_tag),
        'Scale': _spec(soup, 'Scale'),
        'Current': current,
    }
    for column, label in SPEC_COLUMNS:
        row[column] = _spec(soup, label)
    row['Description'] = _text_or_blank(
        soup.find('div', class_='product-add-form-text'))

    loco_list.append(row)

spare_part_list = []

# Collect the spare-parts table of every product page as a DataFrame.
for url in productlinks:
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'lxml')
    try:
        # Parse the spare-parts table and keep only its first three columns.
        spare_parts = pd.read_html(
            str(soup.select('#product-attribute-et-table')))[0].iloc[:, :3]
        spare_parts['Reference'] = soup.select_one(
            '.product-head-artNr').text.strip()
        # Same spelling as the manufacturer name used for the other sheets.
        spare_parts['Manufacturer name'] = 'Roco'
        spare_part_list.append(spare_parts)

    except Exception:
        # Best effort: pages without a usable table are reported and skipped.
        # `except Exception` (not a bare except) keeps Ctrl-C working.
        print(url)

wayslist = []
imgslist = []


def imgpath(folder):
    """Download the product photos of every page in ``productlinks``.

    Images whose URL contains 'def' are saved into *folder* as
    ``Roco-<reference>-<n>.jpg``, with ``n`` counting from 1 per product.
    Each image URL is appended to ``wayslist`` and one record per saved file
    is appended to ``imgslist``.

    The working directory is left unchanged, so files written later with a
    relative path do not end up inside the image folder.
    """
    # Create the folder if needed; exist_ok replaces the former bare
    # `except: pass`, which also hid real errors such as missing permissions.
    os.makedirs(folder, exist_ok=True)

    name = 'Roco'

    for url in productlinks:
        r = requests.get(url, allow_redirects=False)
        soup = BeautifulSoup(r.content, 'html.parser')

        reference_tag = soup.find('span', class_='product-head-artNr')
        if reference_tag is None:
            # Report the page without an article number and skip it.
            print(url)
            continue
        reference = reference_tag.get_text().strip()

        # Filter first so that numbering covers only the saved images;
        # .get avoids a KeyError on <img> tags without a src attribute.
        sources = [image.get('src', '') for image in soup.find_all('img')]
        photo_urls = [src for src in sources if 'def' in src]

        for number, ways in enumerate(photo_urls, start=1):
            wayslist.append(ways)

            filename = name + '-' + reference + '-' + str(number) + '.jpg'
            im = requests.get(ways)
            with open(os.path.join(folder, filename), 'wb') as f:
                f.write(im.content)

            imgslist.append({
                'Manufacturer_name': name,
                'Reference': reference,
                'Photos': filename,
            })


# Resolve the workbook path before imgpath() runs, so the workbook stays in
# the starting directory even if imgpath changes the working directory.
workbook_path = os.path.join(os.getcwd(), 'Roco - locomotives.xlsx')

imgpath('Rocco - images')


df1 = pd.DataFrame(loco_list)
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no spare-parts table was found.
df2 = (pd.concat(spare_part_list, ignore_index=True)
       if spare_part_list else pd.DataFrame())
# df3 = pd.DataFrame()
df4 = pd.DataFrame(imgslist)

# The context manager saves and closes the file on exit; ExcelWriter.save()
# no longer exists in pandas 2.x.
with pd.ExcelWriter(workbook_path, engine='xlsxwriter') as writer:
    df1.to_excel(writer, sheet_name='Model')
    df2.to_excel(writer, sheet_name='Spare parts')
    # df3.to_excel(writer, sheet_name='Documents')
    df4.to_excel(writer, sheet_name='Photos')

print('Saved to file')

继续阅读：imagedownload python web-scraping

How to set name for download images

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集 河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？