College entrance examination admission score line crawler
2022-07-02 15:38:00 【jidawanghao】
```python
# -*- coding: utf-8 -*-
'''
author : dy
Development time : 2021/6/15 17:15
'''
import asyncio
import time
from pathlib import Path

import aiohttp
import pandas as pd
from tqdm import tqdm

current_path = Path.cwd()


def get_url_list(max_id):
    """Build the URL list, skipping school ids already saved in the CSV."""
    url = 'https://static-data.eol.cn/www/2.0/school/%d/info.json'
    not_crawled = set(range(max_id))
    if Path(current_path, 'college_info.csv').exists():
        df = pd.read_csv(Path(current_path, 'college_info.csv'))
        not_crawled -= set(df['School id'].unique())
    return [url % school_id for school_id in not_crawled]


async def get_json_data(url, semaphore):
    async with semaphore:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        }
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False), trust_env=True) as session:
            try:
                async with session.get(url=url, headers=headers, timeout=6) as response:
                    # await suspends this task at the I/O boundary; while it is
                    # suspended, the event loop is free to run other tasks.
                    json_data = await response.json()
                    if json_data:
                        return save_to_csv(json_data['data'])
            except Exception:
                return None


def save_to_csv(json_info):
    save_info = {}
    save_info['School id'] = json_info['school_id']
    save_info['School name'] = json_info['name']
    # School level: "985 211" > "211" > the plain level name
    if json_info['f985'] == '1' and json_info['f211'] == '1':
        level = "985 211"
    elif json_info['f211'] == '1':
        level = "211"
    else:
        level = json_info['level_name']
    save_info['School level'] = level
    save_info['Soft Science ranking'] = json_info['rank']['ruanke_rank']     # 软科 (ShanghaiRanking)
    save_info['Alumni Association ranking'] = json_info['rank']['xyh_rank']  # 校友会
    save_info['Wu Shulian ranking'] = json_info['rank']['wsl_rank']          # 武书连
    save_info['QS world ranking'] = json_info['rank']['qs_world']
    save_info['US News world ranking'] = json_info['rank']['us_rank']
    save_info['School type'] = json_info['type_name']
    save_info['Province'] = json_info['province_name']
    save_info['City'] = json_info['city_name']
    save_info['Location'] = json_info['town_name']
    save_info['Admissions office phone'] = json_info['phone']
    save_info['Admissions office website'] = json_info['site']
    df = pd.DataFrame(save_info, index=[0])
    # Write the header only on the first append
    header = not Path(current_path, 'college_info.csv').exists()
    df.to_csv(Path(current_path, 'college_info.csv'), index=False, mode='a', header=header)


async def main(loop):
    # Build the list of URLs still to be crawled
    url_list = get_url_list(5000)
    # Cap the number of concurrent requests
    semaphore = asyncio.Semaphore(500)
    # Create a task object for each URL and add it to the task list
    tasks = [loop.create_task(get_json_data(url, semaphore)) for url in url_list]
    # Await the tasks as they finish, with a progress bar
    for t in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
        await t


if __name__ == '__main__':
    start = time.time()
    # Switch the event loop policy (Windows only; remove on other platforms)
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    # Create the event loop object
    loop = asyncio.get_event_loop()
    # Add the main task to the event loop and run until it finishes
    loop.run_until_complete(main(loop))
    # Close the event loop object
    loop.close()
    # Post-process: deduplicate, sort by id, and flag missing rankings
    df = pd.read_csv(Path(current_path, 'college_info.csv'))
    df.drop_duplicates(keep='first', inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.sort_values('School id', inplace=True)
    df.loc[df['Soft Science ranking'] == 0, 'Soft Science ranking'] = 999
    df.to_csv(Path(current_path, 'college_info.csv'), index=False)
    print(f'Acquisition complete, total time: {round(time.time() - start, 2)} seconds')
```
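Before launching a few thousand concurrent requests, it can be worth probing a single id to confirm the endpoint still returns the fields `save_to_csv()` expects. The sketch below is illustrative and not part of the original script: school id 140 is an arbitrary example, and passing `content_type=None` is a defensive assumption in case the static server does not label the file as `application/json`.

```python
# Minimal probe sketch: fetch one school's info.json and print the keys
# the crawler reads. School id 140 is an arbitrary, illustrative choice.
import asyncio
import aiohttp


async def probe(school_id: int) -> None:
    url = f'https://static-data.eol.cn/www/2.0/school/{school_id}/info.json'
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=6)) as response:
            # content_type=None skips aiohttp's strict MIME check (assumption:
            # the static server may serve the file as text/plain).
            payload = await response.json(content_type=None)
            data = payload['data']
            for key in ('school_id', 'name', 'f985', 'f211', 'level_name',
                        'type_name', 'province_name', 'city_name',
                        'town_name', 'phone', 'site'):
                print(key, '=', data.get(key))
            print('rank =', data.get('rank'))


if __name__ == '__main__':
    asyncio.run(probe(140))
```

If the probe prints the expected keys, the full crawl should behave the same way; ids that do not correspond to a school fail the request or the `'data'` lookup and are swallowed by the `try/except` in `get_json_data()`.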