スクレイピングで化合物データを試薬サイトから収集する

2021年7月5日
2022年2月26日
データ
スクレイピング, データ収集

はじめに

マテリアルズインフォマティクス、材料分野におけるデータ解析では、データがないという問題に直面することが多々ある。試薬サイトには化合物の基礎物性情報が集約されているため、そこからデータを収集してみた。（試薬サイトのサイト名、サイトURLは伏せます）

留意

スクレイピングを行う際は、そのサイトの著作権や利用規約を確認する。（Webページ上の情報を抜き出して再利用することを利用規約で禁じているサイトもある）
サーバーに負荷をかけないように、アクセスの間隔を1秒以上空ける。（本記事のコードでは3秒空けている）

参考

方針

各試薬情報のページのURLは以下の構成となっており、URLの末尾には試薬コード「△○○○○」が記載されている。
「https://www.XXXX.com/JP/ja/p/△○○○○」

試薬コード「△○○○○」は大文字アルファベット1文字＋4桁の数字（例｜A0020, M4829）なので、アルファベットと数字の組み合わせを全て試して、化合物情報を取得した。

コード

モジュールのインポート

スクレイピングにはBeautifulSoup を用いた。

import pandas as pd
import time
from tqdm import tqdm
from timeout_decorator import timeout, TimeoutError
import urllib.request
from bs4 import BeautifulSoup
import re
import random

関数を定義する。

# Keywords that identify a property-name cell in the product page tables.
_PROPERTY_KEYWORDS = (
    '融点', '沸点', '準位', '屈折率', '比重', '溶解',
    '引火点', '外観', '毒劇法', '極大吸収',
)


# Define the function that collects compound information.
@timeout(60)
def colect_data(code, all_info=None):
    """Scrape one reagent page and record its data in ``all_info``.

    Args:
        code: Reagent code appended to the product URL (e.g. ``'A0020'``).
        all_info: Dict mapping reagent code -> dict of scraped fields. It is
            updated in place. A new dict is created when omitted.

    Returns:
        ``all_info``. If ``code`` is already a key, the dict is returned
        unchanged and no request is made. Otherwise ``all_info[code]`` is
        either ``{'error': 'noInfo'}`` (page title contains "Page Not Found")
        or a dict holding name, CAS, code, package-size -> price entries,
        selected physical properties, 化審法 fields and category lists.

    Raises:
        Errors from ``urllib.request.urlopen`` and the timeout raised by the
        ``@timeout(60)`` decorator propagate to the caller. A page without a
        ``<title>`` string raises ``AttributeError``.
    """
    if all_info is None:
        all_info = {}

    print(code, end=' ')

    # Already collected: nothing to fetch.
    if code in all_info:
        return all_info

    url = 'https://www.XXXX.com/JP/ja/p/{}'.format(code)
    # XXXX: reagent site URL

    # Send a browser-like User-Agent so the request looks like it comes
    # from a browser rather than a scraper.
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0"}

    request = urllib.request.Request(url=url, headers=headers)
    # NOTE(review): the previous version assigned response.encoding = "EUC-JP",
    # which BeautifulSoup does not read. Pass from_encoding= to BeautifulSoup
    # if the page encoding turns out to be misdetected.
    with urllib.request.urlopen(request) as response:
        soup = BeautifulSoup(response, "html.parser")

    # The part of <title> before the first '|' holds "<name> <CAS>".
    title = soup.find("title").string.split('|')[0].strip()

    if "Page Not Found" in title:
        all_info[code] = {'error': 'noInfo'}
        print(":NoInfo")
        return all_info

    print(':OK')

    info_dict = {}

    # Split on the last space. str.strip(CAS) would remove any leading or
    # trailing characters that occur in CAS, which corrupts names that begin
    # with digits or hyphens.
    name, _, cas = title.rpartition(' ')
    info_dict['name'] = name.strip()
    info_dict['CAS'] = cas
    info_dict['code'] = code

    # NOTE(review): the pattern \[a-z] matches the literal text "[a-z]"
    # (the backslash escapes the bracket), so these substitutions likely
    # remove nothing. The patterns are kept unchanged to preserve output.

    # Package sizes and their list prices, paired by position.
    amounts = soup.find_all("td", attrs={"data-attr": "包装単位:"})
    prices = soup.find_all("div", attrs={"class": "listPriceNoStrike"})
    for amount_cell, price_cell in zip(amounts, prices):
        # .string is None when a tag has more than one child; skip that pair.
        if amount_cell.string is None or price_cell.string is None:
            continue
        amount = re.sub(r'\[a-z]', '', amount_cell.string).strip()
        price = re.sub(r'\[a-z]', '', price_cell.string).strip()
        info_dict[amount] = price

    # Miscellaneous properties: a keyword cell followed by its value cell.
    cells = soup.find_all("td")
    for label_cell, value_cell in zip(cells, cells[1:]):
        label = label_cell.string
        if label is None or not any(word in label for word in _PROPERTY_KEYWORDS):
            continue
        value = value_cell.string
        if value is None:
            # Skip only this property instead of abandoning the rest.
            continue
        key = re.sub(r'\[a-z].', '', label).strip()
        info_dict[key] = re.sub(r'\[a-z].', '', value).strip()

    # 化審法 information: the last two lines of the following cell's text.
    for label_cell, value_cell in zip(cells, cells[1:]):
        label = label_cell.string
        if label is None or '化審法' not in label:
            continue
        lines = value_cell.text.split('\n')
        if len(lines) < 2:
            continue
        info_dict['化審法_No'] = re.sub(r'\[a-z].', '', lines[-2]).strip()
        info_dict['化審法'] = re.sub(r'\[a-z]', '', lines[-1]).strip()

    # Category lists: one list of link texts per "startPoint" span, keyed by
    # the sub-category's <h5> heading.
    for category in soup.find_all("div", attrs={"class": "subCategory"}):
        heading = category.find("h5")
        if heading is None:
            continue
        info_dict[heading.text] = [
            [link.text for link in tree.find_all("a")]
            for tree in category.find_all("span", class_="startPoint")
        ]

    all_info[code] = info_dict
    return all_info

関数を実行してDataFrameに格納する。

データが巨大になるので、先頭のアルファベットごと（1万化合物ずつ）DataFrameを作成し、エクセルで保存した。

# Build the reagent codes to try (A0000 through Z9999).
prefix_list = [chr(i) for i in range(65, 91)]
code_list = [str(s).zfill(4) for s in range(0, 10000)]

# The data gets huge, so build one pandas DataFrame per leading letter
# and save each one as an Excel file.
for prefix in prefix_list:

    all_info = {}

    for code_ in tqdm(code_list):

        code = prefix + code_

        try:
            all_info = colect_data(code, all_info)
        except TimeoutError:
            # TimeoutError here is the one imported from timeout_decorator.
            print('timeOut')
            all_info[code] = {'error': 'timeout'}
        except Exception as exc:
            # Record other failures (HTTP errors, unexpected page layout)
            # under their own name instead of mislabeling them as timeouts.
            # KeyboardInterrupt/SystemExit are not caught, so the crawl can
            # still be stopped by hand.
            print('error:', type(exc).__name__)
            all_info[code] = {'error': type(exc).__name__}

        # Wait 3 seconds between requests to avoid loading the server.
        time.sleep(3)

    # Materialize the dict views as lists before handing them to pandas.
    info_df = pd.DataFrame(list(all_info.values()), index=list(all_info.keys()))
    info_df.to_excel('compoundList_{}.xlsx'.format(prefix))

抜き出したデータイメージ

	name	CAS	融点	25G
A0001	Abietic Acid	514-10-3	164 °C	¥6,200
A0002	Ethyl Abietate	631-71-0	NaN	¥14,800
A0003	Acenaphthene	83-32-9	94 °C	¥2,400
⋮	⋮	⋮	⋮	⋮

以上