解析html中的table和表头

2021-08-22 19:31:07 阅读：209 来源： 互联网

标签：output self 表头 cell html ._ table row

原始数据如下：

解析后的数据如下：

补充代码

import pandas as pd
import lxml
from lxml import etree
from bs4 import BeautifulSoup
url = "http://www.stats.gov.cn/tjsj/zxfb/202108/t20210816_1820554.html"
urlhtml=requests.get(url)
urlhtml.encoding='utf-8'
html = etree.HTML(urlhtml.text)  
#html=etree.HTML(urlhtml) #初始化生成一个XPath解析对象
#result=etree.tostring(html,encoding='utf-8')   #解析对象输出代码
result=html.xpath('//table[@class="MsoNormalTable"]')
elements = html.xpath('.//table[@class="MsoNormalTable"]')
for ele in elements:
    header = ele.getparent().getprevious().getprevious()
    print(header.xpath('string(.)'))
    docs = etree.tostring(ele,encoding='utf-8').decode('utf-8')
    extractor = Extractor(docs)
    extractor.parse()
    data = extractor.return_list()
    #print(pd.DataFrame(data))

#Extractor
#!/usr/bin/python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup, Tag
import os
import csv
import pdb

class Extractor(object):
    def __init__(self, input, id_=None, **kwargs):
        # TODO: should divide this class into two subclasses
        # to deal with string and bs4.Tag separately

        # validate the input
        if not isinstance(input, str) and not isinstance(input, Tag):
            raise Exception('Unrecognized type. Valid input: str, bs4.element.Tag')

        soup = BeautifulSoup(input, 'html.parser').find() if isinstance(input, str) else input

        # locate the target table
        if soup.name == 'table':
            self._table = soup
        else:
            self._table = soup.find(id=id_)

        if 'transformer' in kwargs:
            self._transformer = kwargs['transformer']
        else:
            self._transformer = str

        self._output = []

    def parse(self):
        self._output = []
        row_ind = 0
        col_ind = 0
        for row in self._table.find_all('tr'):
            # record the smallest row_span, so that we know how many rows
            # we should skip
            smallest_row_span = 1

            for cell in row.children:
                if cell.name in ('td', 'th'):
                    # check multiple rows
                    # pdb.set_trace()
                    row_span = int(cell.get('rowspan')) if cell.get('rowspan') else 1

                    # try updating smallest_row_span
                    smallest_row_span = min(smallest_row_span, row_span)

                    # check multiple columns
                    col_span = int(cell.get('colspan')) if cell.get('colspan') else 1

                    # find the right index
                    while True:
                        if self._check_cell_validity(row_ind, col_ind):
                            break
                        col_ind += 1

                    # insert into self._output
                    try:
                        self._insert(row_ind, col_ind, row_span, col_span, self._transformer(cell.get_text()))
                    except UnicodeEncodeError:
                        raise Exception( 'Failed to decode text; you might want to specify kwargs transformer=unicode' )

                    # update col_ind
                    col_ind += col_span

            # update row_ind
            row_ind += smallest_row_span
            col_ind = 0
        return self

    def return_list(self):
        return self._output

    def write_to_csv(self, path='.', filename='output.csv'):
        with open(os.path.join(path, filename), 'w') as csv_file:
            table_writer = csv.writer(csv_file)
            for row in self._output:
                table_writer.writerow(row)
        return

    def _check_validity(self, i, j, height, width):
        """
        check if a rectangle (i, j, height, width) can be put into self.output
        """
        return all(self._check_cell_validity(ii, jj) for ii in range(i, i+height) for jj in range(j, j+width))

    def _check_cell_validity(self, i, j):
        """
        check if a cell (i, j) can be put into self._output
        """
        if i >= len(self._output):
            return True
        if j >= len(self._output[i]):
            return True
        if self._output[i][j] is None:
            return True
        return False

    def _insert(self, i, j, height, width, val):
        # pdb.set_trace()
        for ii in range(i, i+height):
            for jj in range(j, j+width):
                self._insert_cell(ii, jj, val)

    def _insert_cell(self, i, j, val):
        while i >= len(self._output):
            self._output.append([])
        while j >= len(self._output[i]):
            self._output[i].append(None)

        if self._output[i][j] is None:
            self._output[i][j] = val

2021年7月份70个大中城市商品住宅销售价格变动情况

参考

(6条消息) lxml模块常用方法整理总结_彭世瑜的博客-CSDN博客
 lxml/Python : get previous-sibling - Stack Overflow
python3解析库lxml - Py.qi - 博客园
 yuanxu-li/html-table-extractor: extract data from html table

标签：output,self,表头,cell,html,._,table,row
来源： https://www.cnblogs.com/ministep/p/15173250.html

本站声明： 1. iCode9 技术分享网（下文简称本站）提供的所有内容，仅供技术学习、探讨和分享；
2. 关于本站的所有留言、评论、转载及引用，纯属内容发起人的个人观点，与本站观点和立场无关；
3. 关于本站的所有言论和文字，纯属内容发起人的个人观点，与本站观点和立场无关；
4. 本站文章均是网友提供，不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属；如您发现该文章侵犯了您的权益，可联系我们第一时间进行删除；
5. 本站为非盈利性的个人网站，所有内容不会用来进行牟利，也不会利用任何形式的广告来间接获益，纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

ICode9

解析html中的table和表头

参考