
    Sample code for automatically downloading papers from arxiv with Python

    Author: dangxusheng  Time: 2021-02-02 12:02

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # @Time  : 2020/02/11 21:44
    # @Author : dangxusheng
    # @Email  : dangxusheng163@163.com
    # @File  : download_by_href.py
    '''
    Automatically download papers from arxiv.org
    '''
    
    import os
    import os.path as osp
    import requests
    from lxml import etree
    from pprint import pprint
    import re
    import time
    import glob
    
    headers = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
      "Host": 'arxiv.org'
    }
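    # note: these headers are only sent with the arxiv.org search request below;
    # the PDF downloads and the Gitee request are made without custom headers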
    
    HREF_CN = 'http://cn.arxiv.org/pdf/'  # China mirror of arxiv
    HREF_SRC = 'http://arxiv.org/pdf/'  # original source, used as a fallback
    SAVE_PATH = '/media/dangxs/E/Paper/download_at_20200730'
    os.makedirs(SAVE_PATH, exist_ok=True)
    
    FAIL_URLS = []
    FAIL_URLS_TXT = f'{SAVE_PATH}/fail_urls.txt'
    
    
    def download(url, title):
      pattern = r'[\\/:*?"\'<>|\r\n]+'
      new_title = re.sub(pattern, " ", title)
      print(f'new title: {new_title}')
      save_filepath = '%s/%s.pdf' % (SAVE_PATH, new_title)
      if osp.exists(save_filepath) and osp.getsize(save_filepath) > 50 * 1024:
        print('this pdf already exists, skip.')
        return True
      try:
        # request the PDF first so a failed connection doesn't leave an empty file behind
        r = requests.get(url, stream=True, timeout=None)
        with open(save_filepath, 'wb') as file:
          # download in chunks
          for chunk in r.iter_content(2048):
            file.write(chunk)
        if osp.getsize(save_filepath) >= 10 * 1024:
          print('%s downloaded successfully.' % title)
          return True
      except Exception as e:
        print(e)
      return False
    
    
    # search arxiv.org via the advanced-search page and return (paper list, total count)
    def search(start_size=0, title_keywords='Facial Expression'):
      # example request URL: https://arxiv.org/find/grp_eess,grp_stat,grp_cs,grp_econ,grp_math/1/ti:+Face/0/1/0/past,2018,2019/0/1?skip=200&query_id=1c582e6c8afc6146&client_host=cn.arxiv.org
      req_url = 'https://arxiv.org/search/advanced'
      req_data = {
        'advanced': 1,
        'terms-0-operator': 'AND',
        'terms-0-term': title_keywords,
        'terms-0-field': 'title',
        'classification-computer_science': 'y',
        'classification-physics_archives': 'all',
        'classification-include_cross_list': 'include',
        'date-filter_by': 'date_range', # date_range | specific_year
        # 'date-year': DOWN_YEAR,
        'date-year': '',
        'date-from_date': '2015',
        'date-to_date': '2020',
        'date-date_type': 'announced_date_first', # submitted_date | submitted_date_first | announced_date_first
        'abstracts': 'show',
        'size': 50,
        'order': '-announced_date_first',
        'start': start_size,
      }
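      # the keys above mirror the form fields of https://arxiv.org/search/advanced;
      # 'size' is the page size and must match the paging step (50) used in __main__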
      res = requests.get(req_url, params=req_data, headers=headers)
      html = res.content.decode()
      html = etree.HTML(html)
    
      total_text = html.xpath('//h1[@class="title is-clearfix"]/text()')
      total_text = ''.join(total_text).replace('\n', '').lstrip(' ').strip(' ')
      # i.e. : Showing 1–50 of 355 results
      num = re.findall(r'\d+', total_text)
      # Sorry, your query returned no results
      if len(num) == 0: return [], 0
    
      total = int(num[-1])  # total number of matching results
      paper_list = html.xpath('//ol[@class="breathe-horizontal"]/li')
      info_list = []
      for p in paper_list:
        title = p.xpath('./p[@class="title is-5 mathjax"]//text()')
        title = ''.join(title).replace('\n', '').lstrip(' ').strip(' ')
        href = p.xpath('./div/p/a/@href')[0]
        info_list.append({'title': title, 'href': href})
    
      return info_list, total
    
    
    # collect paper titles/links from a specific page (here: the-gan-zoo list on Gitee)
    def search_special():
      res = requests.get('https://gitee.com/weberyoung/the-gan-zoo?_from=gitee_search')
      html = res.content.decode()
      html = etree.HTML(html)
    
      paper_list = html.xpath('//div[@class="file_content markdown-body"]//li')
      info_list = []
      for p in paper_list:
        title = p.xpath('.//text()')
        title = ''.join(title).replace('\n', '').lstrip(' ').strip(' ')
        href = p.xpath('./a/@href')[0]
        info_list.append({'title': title, 'href': href})
    
      pprint(info_list)
      return info_list
    
    
    if __name__ == '__main__':
      page_idx = 0
      total = 1000
      keywords = 'Facial Action Unit'
      while page_idx <= total // 50:
        paper_list, total = search(page_idx * 50, keywords)
        print(f'total: {total}')
        if total == 0:
          print('no results found.')
          exit(0)
    
        for p in paper_list:
          title = p['title']
          href = HREF_CN + p['href'].split('/')[-1] + '.pdf'
          print(href)
          if not download(href, title):
            print('Download from the China mirror failed, retrying from the original source >>>>')
            # retry once with the international URL
            href = HREF_SRC + p['href'].split('/')[-1] + '.pdf'
            if not download(href, title):
              FAIL_URLS.append(p)
        page_idx += 1
    
      # download the final partial page (start at the first index of the last page)
      last_start = (total // 50) * 50
      paper_list, total = search(last_start, keywords)
      for p in paper_list:
        title = p['title']
        href = HREF_CN + p['href'].split('/')[-1] + '.pdf'
        if not download(href, title):
          FAIL_URLS.append(p)
        time.sleep(1)
    
      pprint(FAIL_URLS)
      with open(FAIL_URLS_TXT, 'a+') as f:
        for item in FAIL_URLS:
          href = item['href']
          title = item['title']
          f.write(title + '\t' + href + '\n')
    
      print('done.')
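    Failed downloads are recorded in fail_urls.txt, one "title<TAB>href" record per line. Below is a minimal retry sketch (not part of the original script): it assumes the same SAVE_PATH and line format as the script above and simply re-requests each PDF from arxiv.org with the same chunked download.

    # retry_failed.py -- minimal retry sketch; paths and line format assumed from the script above
    import os
    import re
    import requests

    SAVE_PATH = '/media/dangxs/E/Paper/download_at_20200730'  # same folder as the main script
    FAIL_URLS_TXT = f'{SAVE_PATH}/fail_urls.txt'

    with open(FAIL_URLS_TXT, encoding='utf-8') as f:
      lines = [ln.strip() for ln in f if ln.strip()]

    for line in lines:
      # each record is "title<TAB>href", as written by the main script
      title, href = line.split('\t', 1)
      pdf_url = 'http://arxiv.org/pdf/' + href.split('/')[-1] + '.pdf'
      filename = re.sub(r'[\\/:*?"\'<>|\r\n]+', ' ', title) + '.pdf'
      save_filepath = os.path.join(SAVE_PATH, filename)
      try:
        r = requests.get(pdf_url, stream=True, timeout=60)
        with open(save_filepath, 'wb') as out:
          for chunk in r.iter_content(2048):
            out.write(chunk)
        print('%s downloaded.' % title)
      except Exception as e:
        print('%s failed again: %s' % (title, e))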