Python-BeautifulSoup-HTML_Memory error when import multiple HTML files

.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty,.everyoneloves__bot-mid-leaderboard:empty{ height:90px;width:728px;box-sizing:border-box;
}

I am programming a python 2.7 script to import multiple HTML local files (each data files ~ 3MB).
I use the BeautifulSoup4 package but I meet the memory error when I import more than 15 HTML files.
My code is:

from bs4 import BeautifulSoup

import os

import xlwt

# ============= FIND THE FIRST HTML FILE ======= #



directory ='C:UserskhoiptDesktopdata_extractionCalib_SMP_100HTMLHTML1'

flag = 0

for filename in os.listdir(directory):

    if filename.endswith('.HTML'):

        fname_1 = os.path.join(directory,filename)

        print('Filename: {}'.format(fname_1))

        flag = 1

        break



if flag == 0:

    print('There is no HTML file')



fname_1_open = open(fname_1)

soup = BeautifulSoup(fname_1_open, "html.parser")

fname_1_open.close()

tables = soup.find_all("table")



# ================================================================

# =========  DECLARE EXCEL FILE ==================================

# Code for create a excel file

# ================================================================



# =========== FIND THE NUMBER OF HTML FILE ========

# Find the number of HTML files and print it to cnt_total_html

# print('The number of HTML files is: ', cnt_total_html)

# =================================================



# ======  Find the number of titles in HTML file =================

# Find the number of titles in HTML file and print it to counter_title

# Each title can contain many tables

# ================================================================



# ====  FIND THE NUMBER OF TABLES IN HTML FILE =======================

# Find the number of tables in HTML file and print it to counter_table

# ====================================================================



# =================================================================

print('Please select the desired title: ')

num = input()

print('The desired title is: ', table_name[num])



print('Please select the order of table: ')

num_o = input()

print('The desired order of table is: ', num_o)



print('Please select the order of line: ')

num_line = input()

print('The desired order of line is: ', num_line)

# ====================================================================



# ================ FIND AND PRINT THE DESIRED LINE TO EXCEL FILE ===========



cnt_total_html_all = cnt_total_html #cnt_total_html is the total html files 

Matrix_table = [[0 for x in range(counter_table)] for y in range(cnt_total_html)] 





line_no = [None]*counter_title

i = 0

j = 0

k = 0

m = 0

n = 0

flag_cnt_html = 1



x = 2

y = 2

flag_table = 0

flag_line = 0

flag_header = 0

cnt_line = 0



# Seperate the whole HTML files into 10-files block

while cnt_total_html > 10: 

    cnt_total_html = cnt_total_html - 10

    flag_cnt_html += 1

print('So flag_cnt_html la: ',flag_cnt_html)

###################################################



for i in range(flag_cnt_html):

    if (cnt_total_html_all - i*10) > 10:

        TABLE_sheet=worksheet['TABLE']

        for k in range(0+i*10,9+i*10):

            f = open(fopen[k])

            soup = BeautifulSoup(f, "html.parser")

            Matrix_table[k] = soup.find_all("table") #Find all table in the HTML file

            f.close()



            table_no = [None]*counter_table

            table_no = 

            m = 0

            f = open(fopen[k])

            line_num = 0

            lines = f.readlines()

            cnt = 0

            for line in lines:

                line_num = 1 + line_num

                if line.find("table") >= 0:

                    table_no.append(line_num) #find line number of each table

                    cnt += 1

                    m += 1

            f.close

            start_no = Matrix[k][num] #find line number of the desired title

            stop_no = Matrix[k][num+1] #find line number of the sibling desired title



            m = 0

            flag_table = 0

            cnt_line = 0

            for m in range(cnt):

                if table_no[m] in range(start_no,stop_no):

                    i1 = (m - 1)/2

                    mytable = Matrix_table[k][i1]

                    flag_table += 1

                    if flag_table == num_o*2: #Check if the order of table in HTML file is the desired table

                        if flag_header == 0:

                            rows = mytable.findChildren(['th','tr'])

                            cells = rows[0].findChildren('td')

                            y = 2

                            for cell in cells:

                                TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                worksheet.save(filepath)                

                                y = y + 1

                            x = x + 1

                            flag_header = 1

                        rows = mytable.findChildren(['th','tr'])

                        for row in rows[1:]:

                            cells = row.findChildren('td')

                            y = 2

                            cnt_line += 1

                            if cnt_line == num_line: #Check if the order of line in the table is the desired one

                                for cell in cells:

                                    TABLE_sheet.cell(row=2,column=1).value = 'HTML file name' #Write to excel file

                                    TABLE_sheet.cell(row=x,column=1).value = fname_1[k]

                                    TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                    worksheet.save(filepath)

                                    y = y + 1

                                x = x + 1

            worksheet.save(filepath)

            soup.decompose()

            gc.collect()

    else:

        gc.collect()

        TABLE_sheet=worksheet['TABLE']

        for k in range(0+i*10-1, cnt_total_html_all):

            f2 = open(fopen[k])

            soup2 = BeautifulSoup(f2, "html.parser")

            Matrix_table[k] = soup2.find_all("table")

            f2.close()

            table_no = 

            m = 0

            line_num = 0

            f2 = open(fopen[k])

            lines = f2.readlines()

            cnt = 0

            for line in lines:

                line_num = 1 + line_num

                if line.find("table") >= 0:

                    table_no.append(line_num)

                    cnt += 1

                    m += 1

            f2.close()

            start_no = Matrix[k][num]

            stop_no = Matrix[k][num+1]

            m = 0

            flag_table = 0

            cnt_line = 0

            for m in range(cnt):

                if table_no[m] in range(start_no,stop_no):

                    i1 = (m-1)/2

                    mytable = Matrix_table[k][i1]

                    flag_table += 1

                    if flag_table == num_o*2:

                        if flag_header == 0:

                            rows = mytable.findChildren(['th','tr'])

                            cells = rows[0].findChildren('td')

                            y = 2

                            for cell in cells:

                                TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                worksheet.save(filepath)            

                                y = y + 1

                            x = x + 1

                            flag_header = 1

                        rows = mytable.findChildren(['th','tr'])

                        for row in rows[1:]:

                            cells = row.findChildren('td')

                            y = 2

                            cnt_line += 1

                            if cnt_line == num_line:

                                for cell in cells:

                                    TABLE_sheet.cell(row=1,column=1).value = table_name[num]

                                    TABLE_sheet.cell(row=2,column=1).value = 'HTML file name'

                                    TABLE_sheet.cell(row=x,column=1).value = fname_1[k]

                                    TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                    worksheet.save(filepath)

                                    y = y + 1

                                x = x + 1

            worksheet.save(filepath)

            soup2.decompose()

            gc.collect()

# ================================================================

My HTML file is exported from the equipment and it does not look like traditional HTML files. It contains only table descriptions. Unfortunately, I cannot modify the structure of the exported HTML file.

</table>

</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt">

          Duration: 0.02 (s)

          <br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><a href="#TOC" xmlns:msxsl="urn:schemas-microsoft-com:xslt">Back to top

          </a><a name="22" xmlns:msxsl="urn:schemas-microsoft-com:xslt"></a><HR COLOR="#6699CC" SIZE="2" xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>LTE_B1_ESC_APT</b></big></big><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>Status:  </b></big><big><b><span style="color:#00CC00">PASS</span></b></big></UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Run Info</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">

<table border="1">

<tr>

<td bgcolor="#FFFFCC" align="center"><b>Switchpoint Data</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Temp (C)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>RFMode</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Technology</b></td>

<td bgcolor="#FFFFCC" align="center"><b>RF Mode</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Band</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Device</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Instance</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Calibration Mode</b></td>

<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>

</tr>

<tr>

<td align="center">Streaming not supported</td>

<td align="center">22</td>

<td align="center">34</td>

<td align="center">LTE</td>

<td align="center">34</td>

<td align="center">1</td>

<td align="center">0</td>

<td align="center">0</td>

<td align="center">Composite</td>

<td align="center">2.17</td>

</tr>

</table>

</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Segment Lengths</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">

<table border="1">

<tr>

<td bgcolor="#FFFFCC" align="center"><b>Tx Lin Tx Seg Len (ms)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>TxRx Seg Len (ms)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>RF Config Seg Len (ms)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Tuning Seg Len (ms)</b></td>

<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>

</tr>

<tr>

<td align="center">4</td>

<td align="center">4</td>

<td align="center">60</td>

<td align="center">20</td>

<td><span style="color:white">-</span></td>

</tr>

</table>

</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">MEASUREMENTS: Tx Linearizer Sweep</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">

<table border="1">

<tr>

<td bgcolor="#FFFFCC" align="center"><b>Channel</b></td>

<td bgcolor="#FFFFCC" align="center"><b>PA State</b></td>

<td bgcolor="#FFFFCC" align="center"><b>TxAGC</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Vcc</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Icq</b></td>

<td bgcolor="#6699CC" align="center"><b>Power</b></td>

<td bgcolor="#6699CC" align="center"><b>HDET</b></td>

<td bgcolor="#6699CC" align="center"><b>LPM HDET</b></td>

<td bgcolor="#6699CC" align="center"><b>DeltaPwr</b></td>

<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Min</b></td>

<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Max</b></td>

<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>

</tr>

<tr>

<td align="center">18300</td>

<td align="center">1</td>

<td align="center">68</td>

<td align="center">3000</td>

<td align="center">235</td>

<td align="center">26.7</td>

<td align="center">20180</td>

<td align="center">-</td>

<td align="center">0</td>

<td align="center">0</td>

<td align="center">0</td>

<td><span style="color:white">-</span></td>

</tr>

<tr>

<td align="center">18300</td>

<td align="center">1</td>

<td align="center">67</td>

<td align="center">3000</td>

<td align="center">235</td>

<td align="center">26.3</td>

<td align="center">18072</td>

<td align="center">-</td>

<td align="center">0.4</td>

<td align="center">-1</td>

<td align="center">8</td>

<td><span style="color:white">-</span></td>

</tr>

The error when I process more than 15 HTML files:

Exception MemoryError: MemoryError() in <generator object prepare_markup at 0x7CC15A80> ignored

Traceback (most recent call last):

  File ".html_parsing_Nov_01_max15_1_submit_to_SOF.py", line 297, in <module>

    soup2 = BeautifulSoup(f2, "html.parser")

  File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 282, in __init__

    self._feed()

  File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 343, in _feed

    self.builder.feed(self.markup)

MemoryError

I have tested that when I have just parsed the HTML file using BS4 without find_all table, the code can process more than 15 files. I also try to use the BS4 only for the first file and then find the line number of the desired contain, however, I can not find the line number of those lines using BS4.

Would you please take a look and give me your comments as possible?
Thank you very much.

asked Jan 3 at 2:53

khoipt

add a comment |

from bs4 import BeautifulSoup

import os

import xlwt

# ============= FIND THE FIRST HTML FILE ======= #



directory ='C:UserskhoiptDesktopdata_extractionCalib_SMP_100HTMLHTML1'

flag = 0

for filename in os.listdir(directory):

    if filename.endswith('.HTML'):

        fname_1 = os.path.join(directory,filename)

        print('Filename: {}'.format(fname_1))

        flag = 1

        break



if flag == 0:

    print('There is no HTML file')



fname_1_open = open(fname_1)

soup = BeautifulSoup(fname_1_open, "html.parser")

fname_1_open.close()

tables = soup.find_all("table")



# ================================================================

# =========  DECLARE EXCEL FILE ==================================

# Code for create a excel file

# ================================================================



# =========== FIND THE NUMBER OF HTML FILE ========

# Find the number of HTML files and print it to cnt_total_html

# print('The number of HTML files is: ', cnt_total_html)

# =================================================



# ======  Find the number of titles in HTML file =================

# Find the number of titles in HTML file and print it to counter_title

# Each title can contain many tables

# ================================================================



# ====  FIND THE NUMBER OF TABLES IN HTML FILE =======================

# Find the number of tables in HTML file and print it to counter_table

# ====================================================================



# =================================================================

print('Please select the desired title: ')

num = input()

print('The desired title is: ', table_name[num])



print('Please select the order of table: ')

num_o = input()

print('The desired order of table is: ', num_o)



print('Please select the order of line: ')

num_line = input()

print('The desired order of line is: ', num_line)

# ====================================================================



# ================ FIND AND PRINT THE DESIRED LINE TO EXCEL FILE ===========



cnt_total_html_all = cnt_total_html #cnt_total_html is the total html files 

Matrix_table = [[0 for x in range(counter_table)] for y in range(cnt_total_html)] 





line_no = [None]*counter_title

i = 0

j = 0

k = 0

m = 0

n = 0

flag_cnt_html = 1



x = 2

y = 2

flag_table = 0

flag_line = 0

flag_header = 0

cnt_line = 0



# Seperate the whole HTML files into 10-files block

while cnt_total_html > 10: 

    cnt_total_html = cnt_total_html - 10

    flag_cnt_html += 1

print('So flag_cnt_html la: ',flag_cnt_html)

###################################################



for i in range(flag_cnt_html):

    if (cnt_total_html_all - i*10) > 10:

        TABLE_sheet=worksheet['TABLE']

        for k in range(0+i*10,9+i*10):

            f = open(fopen[k])

            soup = BeautifulSoup(f, "html.parser")

            Matrix_table[k] = soup.find_all("table") #Find all table in the HTML file

            f.close()



            table_no = [None]*counter_table

            table_no = 

            m = 0

            f = open(fopen[k])

            line_num = 0

            lines = f.readlines()

            cnt = 0

            for line in lines:

                line_num = 1 + line_num

                if line.find("table") >= 0:

                    table_no.append(line_num) #find line number of each table

                    cnt += 1

                    m += 1

            f.close

            start_no = Matrix[k][num] #find line number of the desired title

            stop_no = Matrix[k][num+1] #find line number of the sibling desired title



            m = 0

            flag_table = 0

            cnt_line = 0

            for m in range(cnt):

                if table_no[m] in range(start_no,stop_no):

                    i1 = (m - 1)/2

                    mytable = Matrix_table[k][i1]

                    flag_table += 1

                    if flag_table == num_o*2: #Check if the order of table in HTML file is the desired table

                        if flag_header == 0:

                            rows = mytable.findChildren(['th','tr'])

                            cells = rows[0].findChildren('td')

                            y = 2

                            for cell in cells:

                                TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                worksheet.save(filepath)                

                                y = y + 1

                            x = x + 1

                            flag_header = 1

                        rows = mytable.findChildren(['th','tr'])

                        for row in rows[1:]:

                            cells = row.findChildren('td')

                            y = 2

                            cnt_line += 1

                            if cnt_line == num_line: #Check if the order of line in the table is the desired one

                                for cell in cells:

                                    TABLE_sheet.cell(row=2,column=1).value = 'HTML file name' #Write to excel file

                                    TABLE_sheet.cell(row=x,column=1).value = fname_1[k]

                                    TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                    worksheet.save(filepath)

                                    y = y + 1

                                x = x + 1

            worksheet.save(filepath)

            soup.decompose()

            gc.collect()

    else:

        gc.collect()

        TABLE_sheet=worksheet['TABLE']

        for k in range(0+i*10-1, cnt_total_html_all):

            f2 = open(fopen[k])

            soup2 = BeautifulSoup(f2, "html.parser")

            Matrix_table[k] = soup2.find_all("table")

            f2.close()

            table_no = 

            m = 0

            line_num = 0

            f2 = open(fopen[k])

            lines = f2.readlines()

            cnt = 0

            for line in lines:

                line_num = 1 + line_num

                if line.find("table") >= 0:

                    table_no.append(line_num)

                    cnt += 1

                    m += 1

            f2.close()

            start_no = Matrix[k][num]

            stop_no = Matrix[k][num+1]

            m = 0

            flag_table = 0

            cnt_line = 0

            for m in range(cnt):

                if table_no[m] in range(start_no,stop_no):

                    i1 = (m-1)/2

                    mytable = Matrix_table[k][i1]

                    flag_table += 1

                    if flag_table == num_o*2:

                        if flag_header == 0:

                            rows = mytable.findChildren(['th','tr'])

                            cells = rows[0].findChildren('td')

                            y = 2

                            for cell in cells:

                                TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                worksheet.save(filepath)            

                                y = y + 1

                            x = x + 1

                            flag_header = 1

                        rows = mytable.findChildren(['th','tr'])

                        for row in rows[1:]:

                            cells = row.findChildren('td')

                            y = 2

                            cnt_line += 1

                            if cnt_line == num_line:

                                for cell in cells:

                                    TABLE_sheet.cell(row=1,column=1).value = table_name[num]

                                    TABLE_sheet.cell(row=2,column=1).value = 'HTML file name'

                                    TABLE_sheet.cell(row=x,column=1).value = fname_1[k]

                                    TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                    worksheet.save(filepath)

                                    y = y + 1

                                x = x + 1

            worksheet.save(filepath)

            soup2.decompose()

            gc.collect()

# ================================================================

</table>

</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt">

          Duration: 0.02 (s)

          <br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><a href="#TOC" xmlns:msxsl="urn:schemas-microsoft-com:xslt">Back to top

          </a><a name="22" xmlns:msxsl="urn:schemas-microsoft-com:xslt"></a><HR COLOR="#6699CC" SIZE="2" xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>LTE_B1_ESC_APT</b></big></big><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>Status:  </b></big><big><b><span style="color:#00CC00">PASS</span></b></big></UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Run Info</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">

<table border="1">

<tr>

<td bgcolor="#FFFFCC" align="center"><b>Switchpoint Data</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Temp (C)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>RFMode</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Technology</b></td>

<td bgcolor="#FFFFCC" align="center"><b>RF Mode</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Band</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Device</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Instance</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Calibration Mode</b></td>

<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>

</tr>

<tr>

<td align="center">Streaming not supported</td>

<td align="center">22</td>

<td align="center">34</td>

<td align="center">LTE</td>

<td align="center">34</td>

<td align="center">1</td>

<td align="center">0</td>

<td align="center">0</td>

<td align="center">Composite</td>

<td align="center">2.17</td>

</tr>

</table>

</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Segment Lengths</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">

<table border="1">

<tr>

<td bgcolor="#FFFFCC" align="center"><b>Tx Lin Tx Seg Len (ms)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>TxRx Seg Len (ms)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>RF Config Seg Len (ms)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Tuning Seg Len (ms)</b></td>

<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>

</tr>

<tr>

<td align="center">4</td>

<td align="center">4</td>

<td align="center">60</td>

<td align="center">20</td>

<td><span style="color:white">-</span></td>

</tr>

</table>

</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">MEASUREMENTS: Tx Linearizer Sweep</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">

<table border="1">

<tr>

<td bgcolor="#FFFFCC" align="center"><b>Channel</b></td>

<td bgcolor="#FFFFCC" align="center"><b>PA State</b></td>

<td bgcolor="#FFFFCC" align="center"><b>TxAGC</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Vcc</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Icq</b></td>

<td bgcolor="#6699CC" align="center"><b>Power</b></td>

<td bgcolor="#6699CC" align="center"><b>HDET</b></td>

<td bgcolor="#6699CC" align="center"><b>LPM HDET</b></td>

<td bgcolor="#6699CC" align="center"><b>DeltaPwr</b></td>

<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Min</b></td>

<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Max</b></td>

<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>

</tr>

<tr>

<td align="center">18300</td>

<td align="center">1</td>

<td align="center">68</td>

<td align="center">3000</td>

<td align="center">235</td>

<td align="center">26.7</td>

<td align="center">20180</td>

<td align="center">-</td>

<td align="center">0</td>

<td align="center">0</td>

<td align="center">0</td>

<td><span style="color:white">-</span></td>

</tr>

<tr>

<td align="center">18300</td>

<td align="center">1</td>

<td align="center">67</td>

<td align="center">3000</td>

<td align="center">235</td>

<td align="center">26.3</td>

<td align="center">18072</td>

<td align="center">-</td>

<td align="center">0.4</td>

<td align="center">-1</td>

<td align="center">8</td>

<td><span style="color:white">-</span></td>

</tr>

The error when I process more than 15 HTML files:

Exception MemoryError: MemoryError() in <generator object prepare_markup at 0x7CC15A80> ignored

Traceback (most recent call last):

  File ".html_parsing_Nov_01_max15_1_submit_to_SOF.py", line 297, in <module>

    soup2 = BeautifulSoup(f2, "html.parser")

  File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 282, in __init__

    self._feed()

  File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 343, in _feed

    self.builder.feed(self.markup)

MemoryError

Would you please take a look and give me your comments as possible?
Thank you very much.

asked Jan 3 at 2:53

khoipt

add a comment |

from bs4 import BeautifulSoup

import os

import xlwt

# ============= FIND THE FIRST HTML FILE ======= #



directory ='C:UserskhoiptDesktopdata_extractionCalib_SMP_100HTMLHTML1'

flag = 0

for filename in os.listdir(directory):

    if filename.endswith('.HTML'):

        fname_1 = os.path.join(directory,filename)

        print('Filename: {}'.format(fname_1))

        flag = 1

        break



if flag == 0:

    print('There is no HTML file')



fname_1_open = open(fname_1)

soup = BeautifulSoup(fname_1_open, "html.parser")

fname_1_open.close()

tables = soup.find_all("table")



# ================================================================

# =========  DECLARE EXCEL FILE ==================================

# Code for create a excel file

# ================================================================



# =========== FIND THE NUMBER OF HTML FILE ========

# Find the number of HTML files and print it to cnt_total_html

# print('The number of HTML files is: ', cnt_total_html)

# =================================================



# ======  Find the number of titles in HTML file =================

# Find the number of titles in HTML file and print it to counter_title

# Each title can contain many tables

# ================================================================



# ====  FIND THE NUMBER OF TABLES IN HTML FILE =======================

# Find the number of tables in HTML file and print it to counter_table

# ====================================================================



# =================================================================

print('Please select the desired title: ')

num = input()

print('The desired title is: ', table_name[num])



print('Please select the order of table: ')

num_o = input()

print('The desired order of table is: ', num_o)



print('Please select the order of line: ')

num_line = input()

print('The desired order of line is: ', num_line)

# ====================================================================



# ================ FIND AND PRINT THE DESIRED LINE TO EXCEL FILE ===========



cnt_total_html_all = cnt_total_html #cnt_total_html is the total html files 

Matrix_table = [[0 for x in range(counter_table)] for y in range(cnt_total_html)] 





line_no = [None]*counter_title

i = 0

j = 0

k = 0

m = 0

n = 0

flag_cnt_html = 1



x = 2

y = 2

flag_table = 0

flag_line = 0

flag_header = 0

cnt_line = 0



# Seperate the whole HTML files into 10-files block

while cnt_total_html > 10: 

    cnt_total_html = cnt_total_html - 10

    flag_cnt_html += 1

print('So flag_cnt_html la: ',flag_cnt_html)

###################################################



for i in range(flag_cnt_html):

    if (cnt_total_html_all - i*10) > 10:

        TABLE_sheet=worksheet['TABLE']

        for k in range(0+i*10,9+i*10):

            f = open(fopen[k])

            soup = BeautifulSoup(f, "html.parser")

            Matrix_table[k] = soup.find_all("table") #Find all table in the HTML file

            f.close()



            table_no = [None]*counter_table

            table_no = 

            m = 0

            f = open(fopen[k])

            line_num = 0

            lines = f.readlines()

            cnt = 0

            for line in lines:

                line_num = 1 + line_num

                if line.find("table") >= 0:

                    table_no.append(line_num) #find line number of each table

                    cnt += 1

                    m += 1

            f.close

            start_no = Matrix[k][num] #find line number of the desired title

            stop_no = Matrix[k][num+1] #find line number of the sibling desired title



            m = 0

            flag_table = 0

            cnt_line = 0

            for m in range(cnt):

                if table_no[m] in range(start_no,stop_no):

                    i1 = (m - 1)/2

                    mytable = Matrix_table[k][i1]

                    flag_table += 1

                    if flag_table == num_o*2: #Check if the order of table in HTML file is the desired table

                        if flag_header == 0:

                            rows = mytable.findChildren(['th','tr'])

                            cells = rows[0].findChildren('td')

                            y = 2

                            for cell in cells:

                                TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                worksheet.save(filepath)                

                                y = y + 1

                            x = x + 1

                            flag_header = 1

                        rows = mytable.findChildren(['th','tr'])

                        for row in rows[1:]:

                            cells = row.findChildren('td')

                            y = 2

                            cnt_line += 1

                            if cnt_line == num_line: #Check if the order of line in the table is the desired one

                                for cell in cells:

                                    TABLE_sheet.cell(row=2,column=1).value = 'HTML file name' #Write to excel file

                                    TABLE_sheet.cell(row=x,column=1).value = fname_1[k]

                                    TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                    worksheet.save(filepath)

                                    y = y + 1

                                x = x + 1

            worksheet.save(filepath)

            soup.decompose()

            gc.collect()

    else:

        gc.collect()

        TABLE_sheet=worksheet['TABLE']

        for k in range(0+i*10-1, cnt_total_html_all):

            f2 = open(fopen[k])

            soup2 = BeautifulSoup(f2, "html.parser")

            Matrix_table[k] = soup2.find_all("table")

            f2.close()

            table_no = 

            m = 0

            line_num = 0

            f2 = open(fopen[k])

            lines = f2.readlines()

            cnt = 0

            for line in lines:

                line_num = 1 + line_num

                if line.find("table") >= 0:

                    table_no.append(line_num)

                    cnt += 1

                    m += 1

            f2.close()

            start_no = Matrix[k][num]

            stop_no = Matrix[k][num+1]

            m = 0

            flag_table = 0

            cnt_line = 0

            for m in range(cnt):

                if table_no[m] in range(start_no,stop_no):

                    i1 = (m-1)/2

                    mytable = Matrix_table[k][i1]

                    flag_table += 1

                    if flag_table == num_o*2:

                        if flag_header == 0:

                            rows = mytable.findChildren(['th','tr'])

                            cells = rows[0].findChildren('td')

                            y = 2

                            for cell in cells:

                                TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                worksheet.save(filepath)            

                                y = y + 1

                            x = x + 1

                            flag_header = 1

                        rows = mytable.findChildren(['th','tr'])

                        for row in rows[1:]:

                            cells = row.findChildren('td')

                            y = 2

                            cnt_line += 1

                            if cnt_line == num_line:

                                for cell in cells:

                                    TABLE_sheet.cell(row=1,column=1).value = table_name[num]

                                    TABLE_sheet.cell(row=2,column=1).value = 'HTML file name'

                                    TABLE_sheet.cell(row=x,column=1).value = fname_1[k]

                                    TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                    worksheet.save(filepath)

                                    y = y + 1

                                x = x + 1

            worksheet.save(filepath)

            soup2.decompose()

            gc.collect()

# ================================================================

</table>

</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt">

          Duration: 0.02 (s)

          <br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><a href="#TOC" xmlns:msxsl="urn:schemas-microsoft-com:xslt">Back to top

          </a><a name="22" xmlns:msxsl="urn:schemas-microsoft-com:xslt"></a><HR COLOR="#6699CC" SIZE="2" xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>LTE_B1_ESC_APT</b></big></big><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>Status:  </b></big><big><b><span style="color:#00CC00">PASS</span></b></big></UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Run Info</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">

<table border="1">

<tr>

<td bgcolor="#FFFFCC" align="center"><b>Switchpoint Data</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Temp (C)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>RFMode</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Technology</b></td>

<td bgcolor="#FFFFCC" align="center"><b>RF Mode</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Band</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Device</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Instance</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Calibration Mode</b></td>

<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>

</tr>

<tr>

<td align="center">Streaming not supported</td>

<td align="center">22</td>

<td align="center">34</td>

<td align="center">LTE</td>

<td align="center">34</td>

<td align="center">1</td>

<td align="center">0</td>

<td align="center">0</td>

<td align="center">Composite</td>

<td align="center">2.17</td>

</tr>

</table>

</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Segment Lengths</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">

<table border="1">

<tr>

<td bgcolor="#FFFFCC" align="center"><b>Tx Lin Tx Seg Len (ms)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>TxRx Seg Len (ms)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>RF Config Seg Len (ms)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Tuning Seg Len (ms)</b></td>

<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>

</tr>

<tr>

<td align="center">4</td>

<td align="center">4</td>

<td align="center">60</td>

<td align="center">20</td>

<td><span style="color:white">-</span></td>

</tr>

</table>

</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">MEASUREMENTS: Tx Linearizer Sweep</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">

<table border="1">

<tr>

<td bgcolor="#FFFFCC" align="center"><b>Channel</b></td>

<td bgcolor="#FFFFCC" align="center"><b>PA State</b></td>

<td bgcolor="#FFFFCC" align="center"><b>TxAGC</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Vcc</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Icq</b></td>

<td bgcolor="#6699CC" align="center"><b>Power</b></td>

<td bgcolor="#6699CC" align="center"><b>HDET</b></td>

<td bgcolor="#6699CC" align="center"><b>LPM HDET</b></td>

<td bgcolor="#6699CC" align="center"><b>DeltaPwr</b></td>

<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Min</b></td>

<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Max</b></td>

<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>

</tr>

<tr>

<td align="center">18300</td>

<td align="center">1</td>

<td align="center">68</td>

<td align="center">3000</td>

<td align="center">235</td>

<td align="center">26.7</td>

<td align="center">20180</td>

<td align="center">-</td>

<td align="center">0</td>

<td align="center">0</td>

<td align="center">0</td>

<td><span style="color:white">-</span></td>

</tr>

<tr>

<td align="center">18300</td>

<td align="center">1</td>

<td align="center">67</td>

<td align="center">3000</td>

<td align="center">235</td>

<td align="center">26.3</td>

<td align="center">18072</td>

<td align="center">-</td>

<td align="center">0.4</td>

<td align="center">-1</td>

<td align="center">8</td>

<td><span style="color:white">-</span></td>

</tr>

The error when I process more than 15 HTML files:

Exception MemoryError: MemoryError() in <generator object prepare_markup at 0x7CC15A80> ignored

Traceback (most recent call last):

  File ".html_parsing_Nov_01_max15_1_submit_to_SOF.py", line 297, in <module>

    soup2 = BeautifulSoup(f2, "html.parser")

  File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 282, in __init__

    self._feed()

  File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 343, in _feed

    self.builder.feed(self.markup)

MemoryError

Would you please take a look and give me your comments as possible?
Thank you very much.

asked Jan 3 at 2:53

khoipt

from bs4 import BeautifulSoup

import os

import xlwt

# ============= FIND THE FIRST HTML FILE ======= #



directory ='C:UserskhoiptDesktopdata_extractionCalib_SMP_100HTMLHTML1'

flag = 0

for filename in os.listdir(directory):

    if filename.endswith('.HTML'):

        fname_1 = os.path.join(directory,filename)

        print('Filename: {}'.format(fname_1))

        flag = 1

        break



if flag == 0:

    print('There is no HTML file')



fname_1_open = open(fname_1)

soup = BeautifulSoup(fname_1_open, "html.parser")

fname_1_open.close()

tables = soup.find_all("table")



# ================================================================

# =========  DECLARE EXCEL FILE ==================================

# Code for create a excel file

# ================================================================



# =========== FIND THE NUMBER OF HTML FILE ========

# Find the number of HTML files and print it to cnt_total_html

# print('The number of HTML files is: ', cnt_total_html)

# =================================================



# ======  Find the number of titles in HTML file =================

# Find the number of titles in HTML file and print it to counter_title

# Each title can contain many tables

# ================================================================



# ====  FIND THE NUMBER OF TABLES IN HTML FILE =======================

# Find the number of tables in HTML file and print it to counter_table

# ====================================================================



# =================================================================

print('Please select the desired title: ')

num = input()

print('The desired title is: ', table_name[num])



print('Please select the order of table: ')

num_o = input()

print('The desired order of table is: ', num_o)



print('Please select the order of line: ')

num_line = input()

print('The desired order of line is: ', num_line)

# ====================================================================



# ================ FIND AND PRINT THE DESIRED LINE TO EXCEL FILE ===========



cnt_total_html_all = cnt_total_html #cnt_total_html is the total html files 

Matrix_table = [[0 for x in range(counter_table)] for y in range(cnt_total_html)] 





line_no = [None]*counter_title

i = 0

j = 0

k = 0

m = 0

n = 0

flag_cnt_html = 1



x = 2

y = 2

flag_table = 0

flag_line = 0

flag_header = 0

cnt_line = 0



# Seperate the whole HTML files into 10-files block

while cnt_total_html > 10: 

    cnt_total_html = cnt_total_html - 10

    flag_cnt_html += 1

print('So flag_cnt_html la: ',flag_cnt_html)

###################################################



for i in range(flag_cnt_html):

    if (cnt_total_html_all - i*10) > 10:

        TABLE_sheet=worksheet['TABLE']

        for k in range(0+i*10,9+i*10):

            f = open(fopen[k])

            soup = BeautifulSoup(f, "html.parser")

            Matrix_table[k] = soup.find_all("table") #Find all table in the HTML file

            f.close()



            table_no = [None]*counter_table

            table_no = 

            m = 0

            f = open(fopen[k])

            line_num = 0

            lines = f.readlines()

            cnt = 0

            for line in lines:

                line_num = 1 + line_num

                if line.find("table") >= 0:

                    table_no.append(line_num) #find line number of each table

                    cnt += 1

                    m += 1

            f.close

            start_no = Matrix[k][num] #find line number of the desired title

            stop_no = Matrix[k][num+1] #find line number of the sibling desired title



            m = 0

            flag_table = 0

            cnt_line = 0

            for m in range(cnt):

                if table_no[m] in range(start_no,stop_no):

                    i1 = (m - 1)/2

                    mytable = Matrix_table[k][i1]

                    flag_table += 1

                    if flag_table == num_o*2: #Check if the order of table in HTML file is the desired table

                        if flag_header == 0:

                            rows = mytable.findChildren(['th','tr'])

                            cells = rows[0].findChildren('td')

                            y = 2

                            for cell in cells:

                                TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                worksheet.save(filepath)                

                                y = y + 1

                            x = x + 1

                            flag_header = 1

                        rows = mytable.findChildren(['th','tr'])

                        for row in rows[1:]:

                            cells = row.findChildren('td')

                            y = 2

                            cnt_line += 1

                            if cnt_line == num_line: #Check if the order of line in the table is the desired one

                                for cell in cells:

                                    TABLE_sheet.cell(row=2,column=1).value = 'HTML file name' #Write to excel file

                                    TABLE_sheet.cell(row=x,column=1).value = fname_1[k]

                                    TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                    worksheet.save(filepath)

                                    y = y + 1

                                x = x + 1

            worksheet.save(filepath)

            soup.decompose()

            gc.collect()

    else:

        gc.collect()

        TABLE_sheet=worksheet['TABLE']

        for k in range(0+i*10-1, cnt_total_html_all):

            f2 = open(fopen[k])

            soup2 = BeautifulSoup(f2, "html.parser")

            Matrix_table[k] = soup2.find_all("table")

            f2.close()

            table_no = 

            m = 0

            line_num = 0

            f2 = open(fopen[k])

            lines = f2.readlines()

            cnt = 0

            for line in lines:

                line_num = 1 + line_num

                if line.find("table") >= 0:

                    table_no.append(line_num)

                    cnt += 1

                    m += 1

            f2.close()

            start_no = Matrix[k][num]

            stop_no = Matrix[k][num+1]

            m = 0

            flag_table = 0

            cnt_line = 0

            for m in range(cnt):

                if table_no[m] in range(start_no,stop_no):

                    i1 = (m-1)/2

                    mytable = Matrix_table[k][i1]

                    flag_table += 1

                    if flag_table == num_o*2:

                        if flag_header == 0:

                            rows = mytable.findChildren(['th','tr'])

                            cells = rows[0].findChildren('td')

                            y = 2

                            for cell in cells:

                                TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                worksheet.save(filepath)            

                                y = y + 1

                            x = x + 1

                            flag_header = 1

                        rows = mytable.findChildren(['th','tr'])

                        for row in rows[1:]:

                            cells = row.findChildren('td')

                            y = 2

                            cnt_line += 1

                            if cnt_line == num_line:

                                for cell in cells:

                                    TABLE_sheet.cell(row=1,column=1).value = table_name[num]

                                    TABLE_sheet.cell(row=2,column=1).value = 'HTML file name'

                                    TABLE_sheet.cell(row=x,column=1).value = fname_1[k]

                                    TABLE_sheet.cell(row=x, column=y).value = cell.get_text()

                                    worksheet.save(filepath)

                                    y = y + 1

                                x = x + 1

            worksheet.save(filepath)

            soup2.decompose()

            gc.collect()

# ================================================================

</table>

</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt">

          Duration: 0.02 (s)

          <br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><a href="#TOC" xmlns:msxsl="urn:schemas-microsoft-com:xslt">Back to top

          </a><a name="22" xmlns:msxsl="urn:schemas-microsoft-com:xslt"></a><HR COLOR="#6699CC" SIZE="2" xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>LTE_B1_ESC_APT</b></big></big><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>Status:  </b></big><big><b><span style="color:#00CC00">PASS</span></b></big></UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Run Info</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">

<table border="1">

<tr>

<td bgcolor="#FFFFCC" align="center"><b>Switchpoint Data</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Temp (C)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>RFMode</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Technology</b></td>

<td bgcolor="#FFFFCC" align="center"><b>RF Mode</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Band</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Device</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Instance</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Calibration Mode</b></td>

<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>

</tr>

<tr>

<td align="center">Streaming not supported</td>

<td align="center">22</td>

<td align="center">34</td>

<td align="center">LTE</td>

<td align="center">34</td>

<td align="center">1</td>

<td align="center">0</td>

<td align="center">0</td>

<td align="center">Composite</td>

<td align="center">2.17</td>

</tr>

</table>

</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Segment Lengths</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">

<table border="1">

<tr>

<td bgcolor="#FFFFCC" align="center"><b>Tx Lin Tx Seg Len (ms)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>TxRx Seg Len (ms)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>RF Config Seg Len (ms)</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Tuning Seg Len (ms)</b></td>

<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>

</tr>

<tr>

<td align="center">4</td>

<td align="center">4</td>

<td align="center">60</td>

<td align="center">20</td>

<td><span style="color:white">-</span></td>

</tr>

</table>

</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">MEASUREMENTS: Tx Linearizer Sweep</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">

<table border="1">

<tr>

<td bgcolor="#FFFFCC" align="center"><b>Channel</b></td>

<td bgcolor="#FFFFCC" align="center"><b>PA State</b></td>

<td bgcolor="#FFFFCC" align="center"><b>TxAGC</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Vcc</b></td>

<td bgcolor="#FFFFCC" align="center"><b>Icq</b></td>

<td bgcolor="#6699CC" align="center"><b>Power</b></td>

<td bgcolor="#6699CC" align="center"><b>HDET</b></td>

<td bgcolor="#6699CC" align="center"><b>LPM HDET</b></td>

<td bgcolor="#6699CC" align="center"><b>DeltaPwr</b></td>

<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Min</b></td>

<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Max</b></td>

<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>

</tr>

<tr>

<td align="center">18300</td>

<td align="center">1</td>

<td align="center">68</td>

<td align="center">3000</td>

<td align="center">235</td>

<td align="center">26.7</td>

<td align="center">20180</td>

<td align="center">-</td>

<td align="center">0</td>

<td align="center">0</td>

<td align="center">0</td>

<td><span style="color:white">-</span></td>

</tr>

<tr>

<td align="center">18300</td>

<td align="center">1</td>

<td align="center">67</td>

<td align="center">3000</td>

<td align="center">235</td>

<td align="center">26.3</td>

<td align="center">18072</td>

<td align="center">-</td>

<td align="center">0.4</td>

<td align="center">-1</td>

<td align="center">8</td>

<td><span style="color:white">-</span></td>

</tr>

The error when I process more than 15 HTML files:

Exception MemoryError: MemoryError() in <generator object prepare_markup at 0x7CC15A80> ignored

Traceback (most recent call last):

  File ".html_parsing_Nov_01_max15_1_submit_to_SOF.py", line 297, in <module>

    soup2 = BeautifulSoup(f2, "html.parser")

  File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 282, in __init__

    self._feed()

  File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 343, in _feed

    self.builder.feed(self.markup)

MemoryError

Would you please take a look and give me your comments as possible?
Thank you very much.

html python-2.7 beautifulsoup out-of-memory

asked Jan 3 at 2:53

khoipt

asked Jan 3 at 2:53

khoipt

asked Jan 3 at 2:53

khoipt

asked Jan 3 at 2:53

khoipt

asked Jan 3 at 2:53

khoipt

add a comment |

0

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54015736%2fpython-beautifulsoup-html-memory-error-when-import-multiple-html-files%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

0

active

oldest

votes

0

active

oldest

votes

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

Search This Blog

Ufyukyu