카테고리 없음

데이터 추출 해버리기 2 (국가안전지수, 치안지수, 살인율) - 작성중

웅수몬 2023. 2. 13. 08:52

 

 

작성중

 

 

 

 

from selenium import webdriver
from selenium.webdriver.support.select import Select
import re
import time

# Keys of each returned record: country, national safety index,
# public-safety (Numbeo) index, homicide rate.
_SAFETY_COLUMNS = ['Country', 'Safety_index', 'Numbeo_index', 'Homicide_rate']

# Position of each of those columns among a table row's <td> cells.
_SAFETY_CELL_INDEXES = (1, 3, 4, 5)

# Matches the leading run of ASCII words (separated by at most one whitespace
# character) and signed digits of a cell. Because the whole group is optional
# the pattern always matches at position 0, possibly as an empty string.
# NOTE(review): the match stops at the first other character, so names that
# contain hyphens, apostrophes or accented letters are truncated -- confirm
# this is intended.
_SAFETY_VALUE_PATTERN = re.compile(r"([a-zA-Z]+\s?|\-?\d\.?)*")


def _fetch_safety_html():
    """Render the safety-index page in headless Chrome and return its HTML."""
    # Function-scope import: the file only imports `webdriver` and `Select`.
    from selenium.webdriver.common.by import By

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")

    # NOTE(review): recent Selenium 4 releases no longer accept the driver
    # path positionally; there this presumably has to become
    # `webdriver.Chrome(options=options)` -- confirm against the installed
    # version.
    driver = webdriver.Chrome("./chromedriver", options=options)
    try:
        driver.implicitly_wait(3)
        driver.get('https://globalresidenceindex.com/hnwi-index/safety-index/')
        time.sleep(2)

        # `find_element(By.NAME, ...)` replaces `find_element_by_name`, which
        # newer Selenium releases removed.
        length_select = Select(
            driver.find_element(By.NAME, 'supsystic-table-14_length')
        )
        # Show every row before reading the page source.
        # Presumably both calls target the same "All" option -- TODO confirm.
        length_select.select_by_visible_text('All')
        length_select.select_by_value('-1')

        return driver.page_source
    finally:
        # Always shut the browser down, even when scraping fails.
        driver.quit()


def _parse_safety_rows(html):
    """Extract one ``{column: text}`` dict per usable row of the safety table."""
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")
    table = soup.select_one('#supsystic-table-14')

    records = []
    for tr in table.tbody.select('tr'):
        cells = tr.select('td')
        # Skip rows that do not carry every expected cell (e.g. a placeholder
        # row) instead of failing with IndexError.
        if len(cells) <= max(_SAFETY_CELL_INDEXES):
            continue
        record = {}
        for column, index in zip(_SAFETY_COLUMNS, _SAFETY_CELL_INDEXES):
            # Strip first: the pattern matches at position 0, so leading
            # whitespace would otherwise produce an empty value.
            text = cells[index].text.strip()
            record[column] = _SAFETY_VALUE_PATTERN.search(text).group().strip()
        records.append(record)
    return records


def _average_by_country(records):
    """Average the numeric columns per country, rounded to two decimals."""
    import pandas as pd

    pd.set_option('display.max_row', 500)

    if not records:
        return []

    df = pd.DataFrame(records, columns=_SAFETY_COLUMNS)
    numeric_columns = _SAFETY_COLUMNS[1:]
    # Unparseable cells become NaN so a single bad value cannot turn the whole
    # column non-numeric and break the mean.
    df[numeric_columns] = df[numeric_columns].apply(
        pd.to_numeric, errors='coerce'
    )

    averaged = df.groupby(['Country'], as_index=False).mean().round(2)
    return averaged.to_dict('records')


def get_safety_data():
    """Scrape the Global Residence Index safety table.

    Returns:
        A list of dicts, one per distinct country, with the keys
        ``'Country'``, ``'Safety_index'``, ``'Numbeo_index'`` and
        ``'Homicide_rate'``. Rows sharing a country are merged by taking the
        mean of each numeric column, rounded to two decimals. An empty list
        is returned when the table has no usable rows.

    Raises:
        AttributeError: if the page has no ``#supsystic-table-14`` table or
            that table has no ``<tbody>``.
    """
    html = _fetch_safety_html()
    return _average_by_country(_parse_safety_rows(html))