当前位置 博文首页 > python操作xml文件示例

    Python 操作 XML 文件示例

    作者:admin 时间:2021-07-04 17:47

    复制代码 Python 代码如下(使用前需先 import os 和 from xml.dom import minidom):

    def get_seed_data(filename):
        """Parse a seed-configuration XML file into a flat list of seed dicts.

        The document is walked as system -> section -> seed. One dict is
        produced per ``<seed>`` element, carrying the attributes of its
        enclosing ``<system>`` and ``<section>`` plus the section's crawl cycle.
        Every seed is also echoed to stdout as it is collected.

        Args:
            filename: Path (or open file object) accepted by ``minidom.parse``.

        Returns:
            A list of dicts in document order, each with the keys
            ``crawl_cycle``, ``system_id``, ``system_name``, ``section_id``,
            ``section_name``, ``seed_id`` and ``userblog_url``. The three
            ``*_id`` values are ints; ``crawl_cycle`` and ``userblog_url`` are
            the raw (unstripped) text of the element's first child node.

        Raises:
            ValueError: If an ``id`` attribute on a system, section or seed is
                missing or not an integer (``getAttribute`` returns ``''`` for
                a missing attribute, which ``int`` rejects).
            IndexError: If a section has no ``<crawl_cycle>`` element, a seed
                has no ``<userblog_url>`` element, or either element is empty.
        """
        # The published snippet omits its imports; bind them locally so the
        # function is self-contained.
        import os
        from xml.dom import minidom

        root = minidom.parse(filename).documentElement
        seed_list = []
        for system_node in root.getElementsByTagName("system"):
            system_id = system_node.getAttribute("id")
            system_name = system_node.getAttribute("name")
            for section_node in system_node.getElementsByTagName("section"):
                section_id = section_node.getAttribute('id')
                section_name = section_node.getAttribute('name')
                # Read once per section: every seed in it shares this value.
                crawl_cycle_nodes = section_node.getElementsByTagName("crawl_cycle")
                crawl_cycle = crawl_cycle_nodes[0].childNodes[0].nodeValue
                for seed_node in section_node.getElementsByTagName('seed'):
                    # The int() conversions stay inside the seed loop so that a
                    # system or section without seeds is never validated,
                    # exactly as before.
                    seed = {}
                    seed['crawl_cycle'] = crawl_cycle
                    seed['system_id'] = int(system_id)
                    seed['system_name'] = system_name
                    seed['section_id'] = int(section_id)
                    seed['section_name'] = section_name
                    seed['seed_id'] = int(seed_node.getAttribute('id'))
                    url_nodes = seed_node.getElementsByTagName('userblog_url')
                    seed['userblog_url'] = url_nodes[0].childNodes[0].nodeValue

                    # Single-argument print(...) behaves the same on Python 2
                    # and Python 3.
                    print('-------------------------------------------')
                    print('system_id:%d' % seed['system_id'])
                    print('system_name:%s' % seed['system_name'])
                    print(' section_id:%d' % seed['section_id'])
                    print(' section_name:%s' % seed['section_name'])
                    print('  seed_id:%d' % seed['seed_id'])
                    print('  userblog_url:%s' % seed['userblog_url'])
                    print('=========================')
                    seed_list.append(seed)
                    print(seed)
                    # "pause" is a cmd.exe builtin; on other platforms the call
                    # only spawned a shell that failed with "command not found".
                    if os.name == 'nt':
                        os.system('pause')
        return seed_list

    复制代码 对应的 XML 输入文件示例如下:

    <?xml version="1.0" encoding="utf-8" ?>
    <!-- Sample input for get_seed_data.
         NOTE(review): this listing shows no id attributes on <system>,
         <section> or <seed>, and each <crawl_cycle> holds only whitespace.
         get_seed_data calls int() on every id attribute, so the attributes
         appear to have been lost when the listing was published; restore
         them before feeding this file to the parser. -->
    <seeds>
     <system name="新浪">
      <section name="娱乐">
       <crawl_cycle> </crawl_cycle>
       <seed >
        <userblog_url>http://aaa.com.cn/loveissuuny</userblog_url>
       </seed>
       <seed >
        <userblog_url>http://aaa.com.cn/loveissuuny</userblog_url>
       </seed>
       <seed >
        <userblog_url>http://aaa.com.cn/sanxiazaixian</userblog_url>
       </seed>
      </section>
      <section name="读书">
       <crawl_cycle> </crawl_cycle>
       <seed >
        <userblog_url>http://aaa.com.cn/twocold</userblog_url>
       </seed>
       <seed >
        <userblog_url>http://aaa.com.cn/u/1233526741</userblog_url>
       </seed>
      </section>
     </system>
    </seeds>

    jsjbwy
    下一篇:没有了