Python-BeautifulSoup-HTML_Memory error when import multiple HTML files





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty,.everyoneloves__bot-mid-leaderboard:empty{ height:90px;width:728px;box-sizing:border-box;
}







0















I am programming a python 2.7 script to import multiple HTML local files (each data files ~ 3MB).
I use the BeautifulSoup4 package but I meet the memory error when I import more than 15 HTML files.
My code is:



from bs4 import BeautifulSoup
import os
import xlwt
# ============= FIND THE FIRST HTML FILE ======= #

directory ='C:UserskhoiptDesktopdata_extractionCalib_SMP_100HTMLHTML1'
flag = 0
for filename in os.listdir(directory):
if filename.endswith('.HTML'):
fname_1 = os.path.join(directory,filename)
print('Filename: {}'.format(fname_1))
flag = 1
break

if flag == 0:
print('There is no HTML file')

fname_1_open = open(fname_1)
soup = BeautifulSoup(fname_1_open, "html.parser")
fname_1_open.close()
tables = soup.find_all("table")

# ================================================================
# ========= DECLARE EXCEL FILE ==================================
# Code for create a excel file
# ================================================================

# =========== FIND THE NUMBER OF HTML FILE ========
# Find the number of HTML files and print it to cnt_total_html
# print('The number of HTML files is: ', cnt_total_html)
# =================================================

# ====== Find the number of titles in HTML file =================
# Find the number of titles in HTML file and print it to counter_title
# Each title can contain many tables
# ================================================================

# ==== FIND THE NUMBER OF TABLES IN HTML FILE =======================
# Find the number of tables in HTML file and print it to counter_table
# ====================================================================

# =================================================================
print('Please select the desired title: ')
num = input()
print('The desired title is: ', table_name[num])

print('Please select the order of table: ')
num_o = input()
print('The desired order of table is: ', num_o)

print('Please select the order of line: ')
num_line = input()
print('The desired order of line is: ', num_line)
# ====================================================================

# ================ FIND AND PRINT THE DESIRED LINE TO EXCEL FILE ===========

cnt_total_html_all = cnt_total_html #cnt_total_html is the total html files
Matrix_table = [[0 for x in range(counter_table)] for y in range(cnt_total_html)]


line_no = [None]*counter_title
i = 0
j = 0
k = 0
m = 0
n = 0
flag_cnt_html = 1

x = 2
y = 2
flag_table = 0
flag_line = 0
flag_header = 0
cnt_line = 0

# Seperate the whole HTML files into 10-files block
while cnt_total_html > 10:
cnt_total_html = cnt_total_html - 10
flag_cnt_html += 1
print('So flag_cnt_html la: ',flag_cnt_html)
###################################################

for i in range(flag_cnt_html):
if (cnt_total_html_all - i*10) > 10:
TABLE_sheet=worksheet['TABLE']
for k in range(0+i*10,9+i*10):
f = open(fopen[k])
soup = BeautifulSoup(f, "html.parser")
Matrix_table[k] = soup.find_all("table") #Find all table in the HTML file
f.close()

table_no = [None]*counter_table
table_no =
m = 0
f = open(fopen[k])
line_num = 0
lines = f.readlines()
cnt = 0
for line in lines:
line_num = 1 + line_num
if line.find("table") >= 0:
table_no.append(line_num) #find line number of each table
cnt += 1
m += 1
f.close
start_no = Matrix[k][num] #find line number of the desired title
stop_no = Matrix[k][num+1] #find line number of the sibling desired title

m = 0
flag_table = 0
cnt_line = 0
for m in range(cnt):
if table_no[m] in range(start_no,stop_no):
i1 = (m - 1)/2
mytable = Matrix_table[k][i1]
flag_table += 1
if flag_table == num_o*2: #Check if the order of table in HTML file is the desired table
if flag_header == 0:
rows = mytable.findChildren(['th','tr'])
cells = rows[0].findChildren('td')
y = 2
for cell in cells:
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
flag_header = 1
rows = mytable.findChildren(['th','tr'])
for row in rows[1:]:
cells = row.findChildren('td')
y = 2
cnt_line += 1
if cnt_line == num_line: #Check if the order of line in the table is the desired one
for cell in cells:
TABLE_sheet.cell(row=2,column=1).value = 'HTML file name' #Write to excel file
TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
worksheet.save(filepath)
soup.decompose()
gc.collect()
else:
gc.collect()
TABLE_sheet=worksheet['TABLE']
for k in range(0+i*10-1, cnt_total_html_all):
f2 = open(fopen[k])
soup2 = BeautifulSoup(f2, "html.parser")
Matrix_table[k] = soup2.find_all("table")
f2.close()
table_no =
m = 0
line_num = 0
f2 = open(fopen[k])
lines = f2.readlines()
cnt = 0
for line in lines:
line_num = 1 + line_num
if line.find("table") >= 0:
table_no.append(line_num)
cnt += 1
m += 1
f2.close()
start_no = Matrix[k][num]
stop_no = Matrix[k][num+1]
m = 0
flag_table = 0
cnt_line = 0
for m in range(cnt):
if table_no[m] in range(start_no,stop_no):
i1 = (m-1)/2
mytable = Matrix_table[k][i1]
flag_table += 1
if flag_table == num_o*2:
if flag_header == 0:
rows = mytable.findChildren(['th','tr'])
cells = rows[0].findChildren('td')
y = 2
for cell in cells:
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
flag_header = 1
rows = mytable.findChildren(['th','tr'])
for row in rows[1:]:
cells = row.findChildren('td')
y = 2
cnt_line += 1
if cnt_line == num_line:
for cell in cells:
TABLE_sheet.cell(row=1,column=1).value = table_name[num]
TABLE_sheet.cell(row=2,column=1).value = 'HTML file name'
TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
worksheet.save(filepath)
soup2.decompose()
gc.collect()
# ================================================================


My HTML file is exported from the equipment and it does not look like traditional HTML files. It contains only table descriptions. Unfortunately, I cannot modify the structure of the exported HTML file.



</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt">
Duration: 0.02 (s)
<br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><a href="#TOC" xmlns:msxsl="urn:schemas-microsoft-com:xslt">Back to top
</a><a name="22" xmlns:msxsl="urn:schemas-microsoft-com:xslt"></a><HR COLOR="#6699CC" SIZE="2" xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>LTE_B1_ESC_APT</b></big></big><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>Status: </b></big><big><b><span style="color:#00CC00">PASS</span></b></big></UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Run Info</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Switchpoint Data</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Temp (C)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RFMode</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Technology</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RF Mode</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Band</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Device</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Instance</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Calibration Mode</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">Streaming not supported</td>
<td align="center">22</td>
<td align="center">34</td>
<td align="center">LTE</td>
<td align="center">34</td>
<td align="center">1</td>
<td align="center">0</td>
<td align="center">0</td>
<td align="center">Composite</td>
<td align="center">2.17</td>
</tr>
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Segment Lengths</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Tx Lin Tx Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>TxRx Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RF Config Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Tuning Seg Len (ms)</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">4</td>
<td align="center">4</td>
<td align="center">60</td>
<td align="center">20</td>
<td><span style="color:white">-</span></td>
</tr>
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">MEASUREMENTS: Tx Linearizer Sweep</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Channel</b></td>
<td bgcolor="#FFFFCC" align="center"><b>PA State</b></td>
<td bgcolor="#FFFFCC" align="center"><b>TxAGC</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Vcc</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Icq</b></td>
<td bgcolor="#6699CC" align="center"><b>Power</b></td>
<td bgcolor="#6699CC" align="center"><b>HDET</b></td>
<td bgcolor="#6699CC" align="center"><b>LPM HDET</b></td>
<td bgcolor="#6699CC" align="center"><b>DeltaPwr</b></td>
<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Min</b></td>
<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Max</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">18300</td>
<td align="center">1</td>
<td align="center">68</td>
<td align="center">3000</td>
<td align="center">235</td>
<td align="center">26.7</td>
<td align="center">20180</td>
<td align="center">-</td>
<td align="center">0</td>
<td align="center">0</td>
<td align="center">0</td>
<td><span style="color:white">-</span></td>
</tr>
<tr>
<td align="center">18300</td>
<td align="center">1</td>
<td align="center">67</td>
<td align="center">3000</td>
<td align="center">235</td>
<td align="center">26.3</td>
<td align="center">18072</td>
<td align="center">-</td>
<td align="center">0.4</td>
<td align="center">-1</td>
<td align="center">8</td>
<td><span style="color:white">-</span></td>
</tr>


The error when I process more than 15 HTML files:



Exception MemoryError: MemoryError() in <generator object prepare_markup at 0x7CC15A80> ignored
Traceback (most recent call last):
File ".html_parsing_Nov_01_max15_1_submit_to_SOF.py", line 297, in <module>
soup2 = BeautifulSoup(f2, "html.parser")
File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 282, in __init__
self._feed()
File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 343, in _feed
self.builder.feed(self.markup)
MemoryError


I have tested that when I have just parsed the HTML file using BS4 without find_all table, the code can process more than 15 files. I also try to use the BS4 only for the first file and then find the line number of the desired contain, however, I can not find the line number of those lines using BS4.



Would you please take a look and give me your comments as possible?
Thank you very much.










share|improve this question





























    0















    I am programming a python 2.7 script to import multiple HTML local files (each data files ~ 3MB).
    I use the BeautifulSoup4 package but I meet the memory error when I import more than 15 HTML files.
    My code is:



    from bs4 import BeautifulSoup
    import os
    import xlwt
    # ============= FIND THE FIRST HTML FILE ======= #

    directory ='C:UserskhoiptDesktopdata_extractionCalib_SMP_100HTMLHTML1'
    flag = 0
    for filename in os.listdir(directory):
    if filename.endswith('.HTML'):
    fname_1 = os.path.join(directory,filename)
    print('Filename: {}'.format(fname_1))
    flag = 1
    break

    if flag == 0:
    print('There is no HTML file')

    fname_1_open = open(fname_1)
    soup = BeautifulSoup(fname_1_open, "html.parser")
    fname_1_open.close()
    tables = soup.find_all("table")

    # ================================================================
    # ========= DECLARE EXCEL FILE ==================================
    # Code for create a excel file
    # ================================================================

    # =========== FIND THE NUMBER OF HTML FILE ========
    # Find the number of HTML files and print it to cnt_total_html
    # print('The number of HTML files is: ', cnt_total_html)
    # =================================================

    # ====== Find the number of titles in HTML file =================
    # Find the number of titles in HTML file and print it to counter_title
    # Each title can contain many tables
    # ================================================================

    # ==== FIND THE NUMBER OF TABLES IN HTML FILE =======================
    # Find the number of tables in HTML file and print it to counter_table
    # ====================================================================

    # =================================================================
    print('Please select the desired title: ')
    num = input()
    print('The desired title is: ', table_name[num])

    print('Please select the order of table: ')
    num_o = input()
    print('The desired order of table is: ', num_o)

    print('Please select the order of line: ')
    num_line = input()
    print('The desired order of line is: ', num_line)
    # ====================================================================

    # ================ FIND AND PRINT THE DESIRED LINE TO EXCEL FILE ===========

    cnt_total_html_all = cnt_total_html #cnt_total_html is the total html files
    Matrix_table = [[0 for x in range(counter_table)] for y in range(cnt_total_html)]


    line_no = [None]*counter_title
    i = 0
    j = 0
    k = 0
    m = 0
    n = 0
    flag_cnt_html = 1

    x = 2
    y = 2
    flag_table = 0
    flag_line = 0
    flag_header = 0
    cnt_line = 0

    # Seperate the whole HTML files into 10-files block
    while cnt_total_html > 10:
    cnt_total_html = cnt_total_html - 10
    flag_cnt_html += 1
    print('So flag_cnt_html la: ',flag_cnt_html)
    ###################################################

    for i in range(flag_cnt_html):
    if (cnt_total_html_all - i*10) > 10:
    TABLE_sheet=worksheet['TABLE']
    for k in range(0+i*10,9+i*10):
    f = open(fopen[k])
    soup = BeautifulSoup(f, "html.parser")
    Matrix_table[k] = soup.find_all("table") #Find all table in the HTML file
    f.close()

    table_no = [None]*counter_table
    table_no =
    m = 0
    f = open(fopen[k])
    line_num = 0
    lines = f.readlines()
    cnt = 0
    for line in lines:
    line_num = 1 + line_num
    if line.find("table") >= 0:
    table_no.append(line_num) #find line number of each table
    cnt += 1
    m += 1
    f.close
    start_no = Matrix[k][num] #find line number of the desired title
    stop_no = Matrix[k][num+1] #find line number of the sibling desired title

    m = 0
    flag_table = 0
    cnt_line = 0
    for m in range(cnt):
    if table_no[m] in range(start_no,stop_no):
    i1 = (m - 1)/2
    mytable = Matrix_table[k][i1]
    flag_table += 1
    if flag_table == num_o*2: #Check if the order of table in HTML file is the desired table
    if flag_header == 0:
    rows = mytable.findChildren(['th','tr'])
    cells = rows[0].findChildren('td')
    y = 2
    for cell in cells:
    TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
    worksheet.save(filepath)
    y = y + 1
    x = x + 1
    flag_header = 1
    rows = mytable.findChildren(['th','tr'])
    for row in rows[1:]:
    cells = row.findChildren('td')
    y = 2
    cnt_line += 1
    if cnt_line == num_line: #Check if the order of line in the table is the desired one
    for cell in cells:
    TABLE_sheet.cell(row=2,column=1).value = 'HTML file name' #Write to excel file
    TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
    TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
    worksheet.save(filepath)
    y = y + 1
    x = x + 1
    worksheet.save(filepath)
    soup.decompose()
    gc.collect()
    else:
    gc.collect()
    TABLE_sheet=worksheet['TABLE']
    for k in range(0+i*10-1, cnt_total_html_all):
    f2 = open(fopen[k])
    soup2 = BeautifulSoup(f2, "html.parser")
    Matrix_table[k] = soup2.find_all("table")
    f2.close()
    table_no =
    m = 0
    line_num = 0
    f2 = open(fopen[k])
    lines = f2.readlines()
    cnt = 0
    for line in lines:
    line_num = 1 + line_num
    if line.find("table") >= 0:
    table_no.append(line_num)
    cnt += 1
    m += 1
    f2.close()
    start_no = Matrix[k][num]
    stop_no = Matrix[k][num+1]
    m = 0
    flag_table = 0
    cnt_line = 0
    for m in range(cnt):
    if table_no[m] in range(start_no,stop_no):
    i1 = (m-1)/2
    mytable = Matrix_table[k][i1]
    flag_table += 1
    if flag_table == num_o*2:
    if flag_header == 0:
    rows = mytable.findChildren(['th','tr'])
    cells = rows[0].findChildren('td')
    y = 2
    for cell in cells:
    TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
    worksheet.save(filepath)
    y = y + 1
    x = x + 1
    flag_header = 1
    rows = mytable.findChildren(['th','tr'])
    for row in rows[1:]:
    cells = row.findChildren('td')
    y = 2
    cnt_line += 1
    if cnt_line == num_line:
    for cell in cells:
    TABLE_sheet.cell(row=1,column=1).value = table_name[num]
    TABLE_sheet.cell(row=2,column=1).value = 'HTML file name'
    TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
    TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
    worksheet.save(filepath)
    y = y + 1
    x = x + 1
    worksheet.save(filepath)
    soup2.decompose()
    gc.collect()
    # ================================================================


    My HTML file is exported from the equipment and it does not look like traditional HTML files. It contains only table descriptions. Unfortunately, I cannot modify the structure of the exported HTML file.



    </table>
    </UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt">
    Duration: 0.02 (s)
    <br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><a href="#TOC" xmlns:msxsl="urn:schemas-microsoft-com:xslt">Back to top
    </a><a name="22" xmlns:msxsl="urn:schemas-microsoft-com:xslt"></a><HR COLOR="#6699CC" SIZE="2" xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>LTE_B1_ESC_APT</b></big></big><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>Status: </b></big><big><b><span style="color:#00CC00">PASS</span></b></big></UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Run Info</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
    <table border="1">
    <tr>
    <td bgcolor="#FFFFCC" align="center"><b>Switchpoint Data</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>Temp (C)</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>RFMode</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>Technology</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>RF Mode</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>Band</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>Device</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>Instance</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>Calibration Mode</b></td>
    <td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
    </tr>
    <tr>
    <td align="center">Streaming not supported</td>
    <td align="center">22</td>
    <td align="center">34</td>
    <td align="center">LTE</td>
    <td align="center">34</td>
    <td align="center">1</td>
    <td align="center">0</td>
    <td align="center">0</td>
    <td align="center">Composite</td>
    <td align="center">2.17</td>
    </tr>
    </table>
    </UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Segment Lengths</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
    <table border="1">
    <tr>
    <td bgcolor="#FFFFCC" align="center"><b>Tx Lin Tx Seg Len (ms)</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>TxRx Seg Len (ms)</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>RF Config Seg Len (ms)</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>Tuning Seg Len (ms)</b></td>
    <td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
    </tr>
    <tr>
    <td align="center">4</td>
    <td align="center">4</td>
    <td align="center">60</td>
    <td align="center">20</td>
    <td><span style="color:white">-</span></td>
    </tr>
    </table>
    </UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">MEASUREMENTS: Tx Linearizer Sweep</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
    <table border="1">
    <tr>
    <td bgcolor="#FFFFCC" align="center"><b>Channel</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>PA State</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>TxAGC</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>Vcc</b></td>
    <td bgcolor="#FFFFCC" align="center"><b>Icq</b></td>
    <td bgcolor="#6699CC" align="center"><b>Power</b></td>
    <td bgcolor="#6699CC" align="center"><b>HDET</b></td>
    <td bgcolor="#6699CC" align="center"><b>LPM HDET</b></td>
    <td bgcolor="#6699CC" align="center"><b>DeltaPwr</b></td>
    <td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Min</b></td>
    <td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Max</b></td>
    <td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
    </tr>
    <tr>
    <td align="center">18300</td>
    <td align="center">1</td>
    <td align="center">68</td>
    <td align="center">3000</td>
    <td align="center">235</td>
    <td align="center">26.7</td>
    <td align="center">20180</td>
    <td align="center">-</td>
    <td align="center">0</td>
    <td align="center">0</td>
    <td align="center">0</td>
    <td><span style="color:white">-</span></td>
    </tr>
    <tr>
    <td align="center">18300</td>
    <td align="center">1</td>
    <td align="center">67</td>
    <td align="center">3000</td>
    <td align="center">235</td>
    <td align="center">26.3</td>
    <td align="center">18072</td>
    <td align="center">-</td>
    <td align="center">0.4</td>
    <td align="center">-1</td>
    <td align="center">8</td>
    <td><span style="color:white">-</span></td>
    </tr>


    The error when I process more than 15 HTML files:



    Exception MemoryError: MemoryError() in <generator object prepare_markup at 0x7CC15A80> ignored
    Traceback (most recent call last):
    File ".html_parsing_Nov_01_max15_1_submit_to_SOF.py", line 297, in <module>
    soup2 = BeautifulSoup(f2, "html.parser")
    File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 282, in __init__
    self._feed()
    File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 343, in _feed
    self.builder.feed(self.markup)
    MemoryError


    I have tested that when I have just parsed the HTML file using BS4 without find_all table, the code can process more than 15 files. I also try to use the BS4 only for the first file and then find the line number of the desired contain, however, I can not find the line number of those lines using BS4.



    Would you please take a look and give me your comments as possible?
    Thank you very much.










    share|improve this question

























      0












      0








      0








      I am programming a python 2.7 script to import multiple HTML local files (each data files ~ 3MB).
      I use the BeautifulSoup4 package but I meet the memory error when I import more than 15 HTML files.
      My code is:



      from bs4 import BeautifulSoup
      import os
      import xlwt
      # ============= FIND THE FIRST HTML FILE ======= #

      directory ='C:UserskhoiptDesktopdata_extractionCalib_SMP_100HTMLHTML1'
      flag = 0
      for filename in os.listdir(directory):
      if filename.endswith('.HTML'):
      fname_1 = os.path.join(directory,filename)
      print('Filename: {}'.format(fname_1))
      flag = 1
      break

      if flag == 0:
      print('There is no HTML file')

      fname_1_open = open(fname_1)
      soup = BeautifulSoup(fname_1_open, "html.parser")
      fname_1_open.close()
      tables = soup.find_all("table")

      # ================================================================
      # ========= DECLARE EXCEL FILE ==================================
      # Code for create a excel file
      # ================================================================

      # =========== FIND THE NUMBER OF HTML FILE ========
      # Find the number of HTML files and print it to cnt_total_html
      # print('The number of HTML files is: ', cnt_total_html)
      # =================================================

      # ====== Find the number of titles in HTML file =================
      # Find the number of titles in HTML file and print it to counter_title
      # Each title can contain many tables
      # ================================================================

      # ==== FIND THE NUMBER OF TABLES IN HTML FILE =======================
      # Find the number of tables in HTML file and print it to counter_table
      # ====================================================================

      # =================================================================
      print('Please select the desired title: ')
      num = input()
      print('The desired title is: ', table_name[num])

      print('Please select the order of table: ')
      num_o = input()
      print('The desired order of table is: ', num_o)

      print('Please select the order of line: ')
      num_line = input()
      print('The desired order of line is: ', num_line)
      # ====================================================================

      # ================ FIND AND PRINT THE DESIRED LINE TO EXCEL FILE ===========

      cnt_total_html_all = cnt_total_html #cnt_total_html is the total html files
      Matrix_table = [[0 for x in range(counter_table)] for y in range(cnt_total_html)]


      line_no = [None]*counter_title
      i = 0
      j = 0
      k = 0
      m = 0
      n = 0
      flag_cnt_html = 1

      x = 2
      y = 2
      flag_table = 0
      flag_line = 0
      flag_header = 0
      cnt_line = 0

      # Seperate the whole HTML files into 10-files block
      while cnt_total_html > 10:
      cnt_total_html = cnt_total_html - 10
      flag_cnt_html += 1
      print('So flag_cnt_html la: ',flag_cnt_html)
      ###################################################

      for i in range(flag_cnt_html):
      if (cnt_total_html_all - i*10) > 10:
      TABLE_sheet=worksheet['TABLE']
      for k in range(0+i*10,9+i*10):
      f = open(fopen[k])
      soup = BeautifulSoup(f, "html.parser")
      Matrix_table[k] = soup.find_all("table") #Find all table in the HTML file
      f.close()

      table_no = [None]*counter_table
      table_no =
      m = 0
      f = open(fopen[k])
      line_num = 0
      lines = f.readlines()
      cnt = 0
      for line in lines:
      line_num = 1 + line_num
      if line.find("table") >= 0:
      table_no.append(line_num) #find line number of each table
      cnt += 1
      m += 1
      f.close
      start_no = Matrix[k][num] #find line number of the desired title
      stop_no = Matrix[k][num+1] #find line number of the sibling desired title

      m = 0
      flag_table = 0
      cnt_line = 0
      for m in range(cnt):
      if table_no[m] in range(start_no,stop_no):
      i1 = (m - 1)/2
      mytable = Matrix_table[k][i1]
      flag_table += 1
      if flag_table == num_o*2: #Check if the order of table in HTML file is the desired table
      if flag_header == 0:
      rows = mytable.findChildren(['th','tr'])
      cells = rows[0].findChildren('td')
      y = 2
      for cell in cells:
      TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
      worksheet.save(filepath)
      y = y + 1
      x = x + 1
      flag_header = 1
      rows = mytable.findChildren(['th','tr'])
      for row in rows[1:]:
      cells = row.findChildren('td')
      y = 2
      cnt_line += 1
      if cnt_line == num_line: #Check if the order of line in the table is the desired one
      for cell in cells:
      TABLE_sheet.cell(row=2,column=1).value = 'HTML file name' #Write to excel file
      TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
      TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
      worksheet.save(filepath)
      y = y + 1
      x = x + 1
      worksheet.save(filepath)
      soup.decompose()
      gc.collect()
      else:
      gc.collect()
      TABLE_sheet=worksheet['TABLE']
      for k in range(0+i*10-1, cnt_total_html_all):
      f2 = open(fopen[k])
      soup2 = BeautifulSoup(f2, "html.parser")
      Matrix_table[k] = soup2.find_all("table")
      f2.close()
      table_no =
      m = 0
      line_num = 0
      f2 = open(fopen[k])
      lines = f2.readlines()
      cnt = 0
      for line in lines:
      line_num = 1 + line_num
      if line.find("table") >= 0:
      table_no.append(line_num)
      cnt += 1
      m += 1
      f2.close()
      start_no = Matrix[k][num]
      stop_no = Matrix[k][num+1]
      m = 0
      flag_table = 0
      cnt_line = 0
      for m in range(cnt):
      if table_no[m] in range(start_no,stop_no):
      i1 = (m-1)/2
      mytable = Matrix_table[k][i1]
      flag_table += 1
      if flag_table == num_o*2:
      if flag_header == 0:
      rows = mytable.findChildren(['th','tr'])
      cells = rows[0].findChildren('td')
      y = 2
      for cell in cells:
      TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
      worksheet.save(filepath)
      y = y + 1
      x = x + 1
      flag_header = 1
      rows = mytable.findChildren(['th','tr'])
      for row in rows[1:]:
      cells = row.findChildren('td')
      y = 2
      cnt_line += 1
      if cnt_line == num_line:
      for cell in cells:
      TABLE_sheet.cell(row=1,column=1).value = table_name[num]
      TABLE_sheet.cell(row=2,column=1).value = 'HTML file name'
      TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
      TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
      worksheet.save(filepath)
      y = y + 1
      x = x + 1
      worksheet.save(filepath)
      soup2.decompose()
      gc.collect()
      # ================================================================


      My HTML file is exported from the equipment and it does not look like traditional HTML files. It contains only table descriptions. Unfortunately, I cannot modify the structure of the exported HTML file.



      </table>
      </UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt">
      Duration: 0.02 (s)
      <br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><a href="#TOC" xmlns:msxsl="urn:schemas-microsoft-com:xslt">Back to top
      </a><a name="22" xmlns:msxsl="urn:schemas-microsoft-com:xslt"></a><HR COLOR="#6699CC" SIZE="2" xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>LTE_B1_ESC_APT</b></big></big><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>Status: </b></big><big><b><span style="color:#00CC00">PASS</span></b></big></UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Run Info</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
      <table border="1">
      <tr>
      <td bgcolor="#FFFFCC" align="center"><b>Switchpoint Data</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Temp (C)</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>RFMode</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Technology</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>RF Mode</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Band</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Device</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Instance</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Calibration Mode</b></td>
      <td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
      </tr>
      <tr>
      <td align="center">Streaming not supported</td>
      <td align="center">22</td>
      <td align="center">34</td>
      <td align="center">LTE</td>
      <td align="center">34</td>
      <td align="center">1</td>
      <td align="center">0</td>
      <td align="center">0</td>
      <td align="center">Composite</td>
      <td align="center">2.17</td>
      </tr>
      </table>
      </UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Segment Lengths</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
      <table border="1">
      <tr>
      <td bgcolor="#FFFFCC" align="center"><b>Tx Lin Tx Seg Len (ms)</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>TxRx Seg Len (ms)</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>RF Config Seg Len (ms)</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Tuning Seg Len (ms)</b></td>
      <td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
      </tr>
      <tr>
      <td align="center">4</td>
      <td align="center">4</td>
      <td align="center">60</td>
      <td align="center">20</td>
      <td><span style="color:white">-</span></td>
      </tr>
      </table>
      </UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">MEASUREMENTS: Tx Linearizer Sweep</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
      <table border="1">
      <tr>
      <td bgcolor="#FFFFCC" align="center"><b>Channel</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>PA State</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>TxAGC</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Vcc</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Icq</b></td>
      <td bgcolor="#6699CC" align="center"><b>Power</b></td>
      <td bgcolor="#6699CC" align="center"><b>HDET</b></td>
      <td bgcolor="#6699CC" align="center"><b>LPM HDET</b></td>
      <td bgcolor="#6699CC" align="center"><b>DeltaPwr</b></td>
      <td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Min</b></td>
      <td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Max</b></td>
      <td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
      </tr>
      <tr>
      <td align="center">18300</td>
      <td align="center">1</td>
      <td align="center">68</td>
      <td align="center">3000</td>
      <td align="center">235</td>
      <td align="center">26.7</td>
      <td align="center">20180</td>
      <td align="center">-</td>
      <td align="center">0</td>
      <td align="center">0</td>
      <td align="center">0</td>
      <td><span style="color:white">-</span></td>
      </tr>
      <tr>
      <td align="center">18300</td>
      <td align="center">1</td>
      <td align="center">67</td>
      <td align="center">3000</td>
      <td align="center">235</td>
      <td align="center">26.3</td>
      <td align="center">18072</td>
      <td align="center">-</td>
      <td align="center">0.4</td>
      <td align="center">-1</td>
      <td align="center">8</td>
      <td><span style="color:white">-</span></td>
      </tr>


      The error when I process more than 15 HTML files:



      Exception MemoryError: MemoryError() in <generator object prepare_markup at 0x7CC15A80> ignored
      Traceback (most recent call last):
      File ".html_parsing_Nov_01_max15_1_submit_to_SOF.py", line 297, in <module>
      soup2 = BeautifulSoup(f2, "html.parser")
      File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 282, in __init__
      self._feed()
      File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 343, in _feed
      self.builder.feed(self.markup)
      MemoryError


      I have tested that when I have just parsed the HTML file using BS4 without find_all table, the code can process more than 15 files. I also try to use the BS4 only for the first file and then find the line number of the desired contain, however, I can not find the line number of those lines using BS4.



      Would you please take a look and give me your comments as possible?
      Thank you very much.










      share|improve this question














      I am programming a python 2.7 script to import multiple HTML local files (each data files ~ 3MB).
      I use the BeautifulSoup4 package but I meet the memory error when I import more than 15 HTML files.
      My code is:



      from bs4 import BeautifulSoup
      import os
      import xlwt
      # ============= FIND THE FIRST HTML FILE ======= #

      directory ='C:UserskhoiptDesktopdata_extractionCalib_SMP_100HTMLHTML1'
      flag = 0
      for filename in os.listdir(directory):
      if filename.endswith('.HTML'):
      fname_1 = os.path.join(directory,filename)
      print('Filename: {}'.format(fname_1))
      flag = 1
      break

      if flag == 0:
      print('There is no HTML file')

      fname_1_open = open(fname_1)
      soup = BeautifulSoup(fname_1_open, "html.parser")
      fname_1_open.close()
      tables = soup.find_all("table")

      # ================================================================
      # ========= DECLARE EXCEL FILE ==================================
      # Code for create a excel file
      # ================================================================

      # =========== FIND THE NUMBER OF HTML FILE ========
      # Find the number of HTML files and print it to cnt_total_html
      # print('The number of HTML files is: ', cnt_total_html)
      # =================================================

      # ====== Find the number of titles in HTML file =================
      # Find the number of titles in HTML file and print it to counter_title
      # Each title can contain many tables
      # ================================================================

      # ==== FIND THE NUMBER OF TABLES IN HTML FILE =======================
      # Find the number of tables in HTML file and print it to counter_table
      # ====================================================================

      # =================================================================
      print('Please select the desired title: ')
      num = input()
      print('The desired title is: ', table_name[num])

      print('Please select the order of table: ')
      num_o = input()
      print('The desired order of table is: ', num_o)

      print('Please select the order of line: ')
      num_line = input()
      print('The desired order of line is: ', num_line)
      # ====================================================================

      # ================ FIND AND PRINT THE DESIRED LINE TO EXCEL FILE ===========

      cnt_total_html_all = cnt_total_html #cnt_total_html is the total html files
      Matrix_table = [[0 for x in range(counter_table)] for y in range(cnt_total_html)]


      line_no = [None]*counter_title
      i = 0
      j = 0
      k = 0
      m = 0
      n = 0
      flag_cnt_html = 1

      x = 2
      y = 2
      flag_table = 0
      flag_line = 0
      flag_header = 0
      cnt_line = 0

      # Seperate the whole HTML files into 10-files block
      while cnt_total_html > 10:
      cnt_total_html = cnt_total_html - 10
      flag_cnt_html += 1
      print('So flag_cnt_html la: ',flag_cnt_html)
      ###################################################

      for i in range(flag_cnt_html):
      if (cnt_total_html_all - i*10) > 10:
      TABLE_sheet=worksheet['TABLE']
      for k in range(0+i*10,9+i*10):
      f = open(fopen[k])
      soup = BeautifulSoup(f, "html.parser")
      Matrix_table[k] = soup.find_all("table") #Find all table in the HTML file
      f.close()

      table_no = [None]*counter_table
      table_no =
      m = 0
      f = open(fopen[k])
      line_num = 0
      lines = f.readlines()
      cnt = 0
      for line in lines:
      line_num = 1 + line_num
      if line.find("table") >= 0:
      table_no.append(line_num) #find line number of each table
      cnt += 1
      m += 1
      f.close
      start_no = Matrix[k][num] #find line number of the desired title
      stop_no = Matrix[k][num+1] #find line number of the sibling desired title

      m = 0
      flag_table = 0
      cnt_line = 0
      for m in range(cnt):
      if table_no[m] in range(start_no,stop_no):
      i1 = (m - 1)/2
      mytable = Matrix_table[k][i1]
      flag_table += 1
      if flag_table == num_o*2: #Check if the order of table in HTML file is the desired table
      if flag_header == 0:
      rows = mytable.findChildren(['th','tr'])
      cells = rows[0].findChildren('td')
      y = 2
      for cell in cells:
      TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
      worksheet.save(filepath)
      y = y + 1
      x = x + 1
      flag_header = 1
      rows = mytable.findChildren(['th','tr'])
      for row in rows[1:]:
      cells = row.findChildren('td')
      y = 2
      cnt_line += 1
      if cnt_line == num_line: #Check if the order of line in the table is the desired one
      for cell in cells:
      TABLE_sheet.cell(row=2,column=1).value = 'HTML file name' #Write to excel file
      TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
      TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
      worksheet.save(filepath)
      y = y + 1
      x = x + 1
      worksheet.save(filepath)
      soup.decompose()
      gc.collect()
      else:
      gc.collect()
      TABLE_sheet=worksheet['TABLE']
      for k in range(0+i*10-1, cnt_total_html_all):
      f2 = open(fopen[k])
      soup2 = BeautifulSoup(f2, "html.parser")
      Matrix_table[k] = soup2.find_all("table")
      f2.close()
      table_no =
      m = 0
      line_num = 0
      f2 = open(fopen[k])
      lines = f2.readlines()
      cnt = 0
      for line in lines:
      line_num = 1 + line_num
      if line.find("table") >= 0:
      table_no.append(line_num)
      cnt += 1
      m += 1
      f2.close()
      start_no = Matrix[k][num]
      stop_no = Matrix[k][num+1]
      m = 0
      flag_table = 0
      cnt_line = 0
      for m in range(cnt):
      if table_no[m] in range(start_no,stop_no):
      i1 = (m-1)/2
      mytable = Matrix_table[k][i1]
      flag_table += 1
      if flag_table == num_o*2:
      if flag_header == 0:
      rows = mytable.findChildren(['th','tr'])
      cells = rows[0].findChildren('td')
      y = 2
      for cell in cells:
      TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
      worksheet.save(filepath)
      y = y + 1
      x = x + 1
      flag_header = 1
      rows = mytable.findChildren(['th','tr'])
      for row in rows[1:]:
      cells = row.findChildren('td')
      y = 2
      cnt_line += 1
      if cnt_line == num_line:
      for cell in cells:
      TABLE_sheet.cell(row=1,column=1).value = table_name[num]
      TABLE_sheet.cell(row=2,column=1).value = 'HTML file name'
      TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
      TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
      worksheet.save(filepath)
      y = y + 1
      x = x + 1
      worksheet.save(filepath)
      soup2.decompose()
      gc.collect()
      # ================================================================


      My HTML file is exported from the equipment and it does not look like traditional HTML files. It contains only table descriptions. Unfortunately, I cannot modify the structure of the exported HTML file.



      </table>
      </UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt">
      Duration: 0.02 (s)
      <br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><a href="#TOC" xmlns:msxsl="urn:schemas-microsoft-com:xslt">Back to top
      </a><a name="22" xmlns:msxsl="urn:schemas-microsoft-com:xslt"></a><HR COLOR="#6699CC" SIZE="2" xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>LTE_B1_ESC_APT</b></big></big><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>Status: </b></big><big><b><span style="color:#00CC00">PASS</span></b></big></UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Run Info</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
      <table border="1">
      <tr>
      <td bgcolor="#FFFFCC" align="center"><b>Switchpoint Data</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Temp (C)</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>RFMode</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Technology</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>RF Mode</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Band</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Device</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Instance</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Calibration Mode</b></td>
      <td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
      </tr>
      <tr>
      <td align="center">Streaming not supported</td>
      <td align="center">22</td>
      <td align="center">34</td>
      <td align="center">LTE</td>
      <td align="center">34</td>
      <td align="center">1</td>
      <td align="center">0</td>
      <td align="center">0</td>
      <td align="center">Composite</td>
      <td align="center">2.17</td>
      </tr>
      </table>
      </UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Segment Lengths</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
      <table border="1">
      <tr>
      <td bgcolor="#FFFFCC" align="center"><b>Tx Lin Tx Seg Len (ms)</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>TxRx Seg Len (ms)</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>RF Config Seg Len (ms)</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Tuning Seg Len (ms)</b></td>
      <td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
      </tr>
      <tr>
      <td align="center">4</td>
      <td align="center">4</td>
      <td align="center">60</td>
      <td align="center">20</td>
      <td><span style="color:white">-</span></td>
      </tr>
      </table>
      </UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">MEASUREMENTS: Tx Linearizer Sweep</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
      <table border="1">
      <tr>
      <td bgcolor="#FFFFCC" align="center"><b>Channel</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>PA State</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>TxAGC</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Vcc</b></td>
      <td bgcolor="#FFFFCC" align="center"><b>Icq</b></td>
      <td bgcolor="#6699CC" align="center"><b>Power</b></td>
      <td bgcolor="#6699CC" align="center"><b>HDET</b></td>
      <td bgcolor="#6699CC" align="center"><b>LPM HDET</b></td>
      <td bgcolor="#6699CC" align="center"><b>DeltaPwr</b></td>
      <td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Min</b></td>
      <td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Max</b></td>
      <td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
      </tr>
      <tr>
      <td align="center">18300</td>
      <td align="center">1</td>
      <td align="center">68</td>
      <td align="center">3000</td>
      <td align="center">235</td>
      <td align="center">26.7</td>
      <td align="center">20180</td>
      <td align="center">-</td>
      <td align="center">0</td>
      <td align="center">0</td>
      <td align="center">0</td>
      <td><span style="color:white">-</span></td>
      </tr>
      <tr>
      <td align="center">18300</td>
      <td align="center">1</td>
      <td align="center">67</td>
      <td align="center">3000</td>
      <td align="center">235</td>
      <td align="center">26.3</td>
      <td align="center">18072</td>
      <td align="center">-</td>
      <td align="center">0.4</td>
      <td align="center">-1</td>
      <td align="center">8</td>
      <td><span style="color:white">-</span></td>
      </tr>


      The error when I process more than 15 HTML files:



      Exception MemoryError: MemoryError() in <generator object prepare_markup at 0x7CC15A80> ignored
      Traceback (most recent call last):
      File ".html_parsing_Nov_01_max15_1_submit_to_SOF.py", line 297, in <module>
      soup2 = BeautifulSoup(f2, "html.parser")
      File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 282, in __init__
      self._feed()
      File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 343, in _feed
      self.builder.feed(self.markup)
      MemoryError


      I have tested that when I have just parsed the HTML file using BS4 without find_all table, the code can process more than 15 files. I also try to use the BS4 only for the first file and then find the line number of the desired contain, however, I can not find the line number of those lines using BS4.



      Would you please take a look and give me your comments as possible?
      Thank you very much.







      html python-2.7 beautifulsoup out-of-memory






      share|improve this question













      share|improve this question











      share|improve this question




      share|improve this question










      asked Jan 3 at 2:53









      khoiptkhoipt

      11




      11
























          0






          active

          oldest

          votes












          Your Answer






          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          StackExchange.ready(function() {
          var channelOptions = {
          tags: "".split(" "),
          id: "1"
          };
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function() {
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled) {
          StackExchange.using("snippets", function() {
          createEditor();
          });
          }
          else {
          createEditor();
          }
          });

          function createEditor() {
          StackExchange.prepareEditor({
          heartbeatType: 'answer',
          autoActivateHeartbeat: false,
          convertImagesToLinks: true,
          noModals: true,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: 10,
          bindNavPrevention: true,
          postfix: "",
          imageUploader: {
          brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
          contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
          allowUrls: true
          },
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          });


          }
          });














          draft saved

          draft discarded


















          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54015736%2fpython-beautifulsoup-html-memory-error-when-import-multiple-html-files%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown

























          0






          active

          oldest

          votes








          0






          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes
















          draft saved

          draft discarded




















































          Thanks for contributing an answer to Stack Overflow!


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          To learn more, see our tips on writing great answers.




          draft saved


          draft discarded














          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54015736%2fpython-beautifulsoup-html-memory-error-when-import-multiple-html-files%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          Popular posts from this blog

          Can a sorcerer learn a 5th-level spell early by creating spell slots using the Font of Magic feature?

          ts Property 'filter' does not exist on type '{}'

          mat-slide-toggle shouldn't change it's state when I click cancel in confirmation window