芋の独り言


Fetching sample images from a certain site
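
A small Python script that pulls the sample assets from a product page on a certain site: the package image, the sample images, the sample video, plus (on what seems to be the newer page layout) a sample zip, and the audio files on voice products. requests and BeautifulSoup handle the scraping; a wxPython dialog picks the save folder.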


import requests
from bs4 import BeautifulSoup as bs
import os
import sys
import wx
from time import sleep

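# GET with retry: on a timeout or connection error, wait 10 seconds and try again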
def req_get(url):
    while True:
        try:
            res = requests.get(url, timeout=(3.0, 7.5))
        except (requests.exceptions.Timeout,requests.exceptions.ConnectionError):
            sleep(10)
            continue
        return res

# Ask for the destination folder with a wxPython dialog
def path():
    app=wx.App()
    wx.MessageBox('Select a destination folder','Folder selection',wx.STAY_ON_TOP)
    # Build the folder-selection dialog
    folda = wx.DirDialog(None,style=wx.DD_CHANGE_DIR | wx.STAY_ON_TOP,message="Destination folder")
    folda_path = None
    # Remember the chosen path; stays None if the dialog is cancelled
    if folda.ShowModal() == wx.ID_OK:
        folda_path = folda.GetPath()
    folda.Destroy()
    return folda_path

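# Download every sample asset on one product page into a subfolder of path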
def download(url,path):
    res=req_get(url)
    sp=bs(res.content,"html.parser")

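    # The package-image link differs between what appear to be two page layouts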
    try:
        pk=sp.find("div",attrs={"class":"tx10 pd-3 lh4"}).a
    except AttributeError:
        pk=sp.find("a",attrs={"name":"package-image"})

    if pk:
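        # Sanitize the alt text so it is safe as a Windows folder/file name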
        title=pk.img.get("alt").replace(":","_").replace("*","").replace("/","_").replace(".","").strip()

        path=os.path.join(path,title)
        try:
            os.mkdir(path)
        except FileExistsError:
            pass
    
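        # Save the linked package image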
        pk=pk.get("href")
        with open(os.path.join(path,title+".jpg"),mode='wb') as f:
            f.write(req_get(pk).content)

        new_layout = False
    else:
        # No package-image link: presumably the newer page layout; take the title from og:title instead
        title = sp.find("meta",attrs={"property":"og:title"}).get("content").replace(":","_").replace("*","").replace("/","_").replace(".","").strip()
        new_layout = True

        path=os.path.join(path,title)
        try:
            os.mkdir(path)
        except FileExistsError:
            pass

    print(title,":sample download")
    
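    # Collect the sample-image URLs, rewriting each thumbnail filename into what appears to be its full-size variant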
    if new_layout:
        img=[i.img.get("src").replace("js-","jp-") for i in sp.find_all("li",attrs={"class":"previewList__item"})]
    else:
        img=[i.img.get("src").replace("-","jp-") for i in sp.find_all('a',attrs={"name":"sample-image"})]

    for i in img:
        with open(os.path.join(path,i.split('/')[-1]),mode="wb") as f:
            f.write(req_get(i).content)

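    # Sample video: the player URL sits in an onclick="sampleplay('...')" handler;
    # fetch that page, follow its first tag's src to the real player, then pull the
    # "src" entries out of the escaped JSON in its <script> tags (the last entry
    # appears to be the highest-quality stream)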
    if sp.find(attrs={"id":"sample-video"}):
        try:
            mv=sp.find("div",attrs={"id":"detail-sample-movie"}).div.a.get("onclick")
            mv=mv.replace("sampleplay('","").replace("');return false;","")+"#sample-video"
            mv=req_get("https://某サイト"+mv)
            mv=req_get(bs(mv.content,"html.parser").find().get("src"))
            mvpath=[]
            sp2=bs(mv.content.decode('unicode-escape'),"html.parser")
            for j in [i.string for i in sp2.find_all("script") if i.string is not None]:
                mvpath+=["https:"+k.replace("]","").replace("}","").replace('"src":','').replace('"','').replace("\\","") for k in j.split(',') if "src" in k]
            with open(os.path.join(path,title+".mp4"),mode='wb') as f:
                f.write(req_get(mvpath[-1]).content)
        except AttributeError:
            pass
  
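    # Newer-layout pages may also offer a zip of all the sample images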
    if new_layout:
        try:
            zip_path = sp.find("div",attrs={"class":"sampleButton__item"})
            zip_path = zip_path.a.get("href")
            zip_data = req_get(zip_path)
            
            with open(os.path.join(path,"sample.zip"),mode="wb") as f:
                f.write(zip_data.content)

        except AttributeError:
            pass

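    # Voice products: download every mp4 <source> on the page as well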
    if sp.find("span",attrs={"class":"c_icon_productGenre -voice"}):           
        mp4list = [i.get("src") for i in sp.find_all("source",attrs={"type":"video/mp4"})]
        for mu in mp4list:
            with open(os.path.join(path,mu.split('/')[-1]),mode="wb") as f:
                f.write(req_get(mu).content)
           
    print('finish')

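# Pick a folder, collect product-page URLs interactively, then download them all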
if __name__ == "__main__":
    url_list = []
    conti = True
    save_path = path()
    # Quit cleanly if the folder dialog was cancelled
    if save_path is None:
        sys.exit("no folder selected")

    while conti:
        url_list.append(input("url:"))

        while True:
            ans = input("continue?(y/n):")
            if ans == 'y':
                conti = True
                break
            elif ans == 'n':
                conti = False
                break

    for u in url_list:
        download(u,save_path)
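
To use it: run the script, pick a save folder in the dialog, then paste product-page URLs one at a time, answering y to queue another and n to start the downloads. Each title gets its own subfolder. Note that the site's domain is left as 某サイト in the source, so the sample-video step needs it filled in before it will work.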