Python-BeautifulSoup-HTML_Memory error when import multiple HTML files
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty,.everyoneloves__bot-mid-leaderboard:empty{ height:90px;width:728px;box-sizing:border-box;
}
I am programming a python 2.7 script to import multiple HTML local files (each data files ~ 3MB).
I use the BeautifulSoup4 package but I meet the memory error when I import more than 15 HTML files.
My code is:
from bs4 import BeautifulSoup
import os
import xlwt
# ============= FIND THE FIRST HTML FILE ======= #
directory ='C:UserskhoiptDesktopdata_extractionCalib_SMP_100HTMLHTML1'
flag = 0
for filename in os.listdir(directory):
if filename.endswith('.HTML'):
fname_1 = os.path.join(directory,filename)
print('Filename: {}'.format(fname_1))
flag = 1
break
if flag == 0:
print('There is no HTML file')
fname_1_open = open(fname_1)
soup = BeautifulSoup(fname_1_open, "html.parser")
fname_1_open.close()
tables = soup.find_all("table")
# ================================================================
# ========= DECLARE EXCEL FILE ==================================
# Code for create a excel file
# ================================================================
# =========== FIND THE NUMBER OF HTML FILE ========
# Find the number of HTML files and print it to cnt_total_html
# print('The number of HTML files is: ', cnt_total_html)
# =================================================
# ====== Find the number of titles in HTML file =================
# Find the number of titles in HTML file and print it to counter_title
# Each title can contain many tables
# ================================================================
# ==== FIND THE NUMBER OF TABLES IN HTML FILE =======================
# Find the number of tables in HTML file and print it to counter_table
# ====================================================================
# =================================================================
print('Please select the desired title: ')
num = input()
print('The desired title is: ', table_name[num])
print('Please select the order of table: ')
num_o = input()
print('The desired order of table is: ', num_o)
print('Please select the order of line: ')
num_line = input()
print('The desired order of line is: ', num_line)
# ====================================================================
# ================ FIND AND PRINT THE DESIRED LINE TO EXCEL FILE ===========
cnt_total_html_all = cnt_total_html #cnt_total_html is the total html files
Matrix_table = [[0 for x in range(counter_table)] for y in range(cnt_total_html)]
line_no = [None]*counter_title
i = 0
j = 0
k = 0
m = 0
n = 0
flag_cnt_html = 1
x = 2
y = 2
flag_table = 0
flag_line = 0
flag_header = 0
cnt_line = 0
# Seperate the whole HTML files into 10-files block
while cnt_total_html > 10:
cnt_total_html = cnt_total_html - 10
flag_cnt_html += 1
print('So flag_cnt_html la: ',flag_cnt_html)
###################################################
for i in range(flag_cnt_html):
if (cnt_total_html_all - i*10) > 10:
TABLE_sheet=worksheet['TABLE']
for k in range(0+i*10,9+i*10):
f = open(fopen[k])
soup = BeautifulSoup(f, "html.parser")
Matrix_table[k] = soup.find_all("table") #Find all table in the HTML file
f.close()
table_no = [None]*counter_table
table_no =
m = 0
f = open(fopen[k])
line_num = 0
lines = f.readlines()
cnt = 0
for line in lines:
line_num = 1 + line_num
if line.find("table") >= 0:
table_no.append(line_num) #find line number of each table
cnt += 1
m += 1
f.close
start_no = Matrix[k][num] #find line number of the desired title
stop_no = Matrix[k][num+1] #find line number of the sibling desired title
m = 0
flag_table = 0
cnt_line = 0
for m in range(cnt):
if table_no[m] in range(start_no,stop_no):
i1 = (m - 1)/2
mytable = Matrix_table[k][i1]
flag_table += 1
if flag_table == num_o*2: #Check if the order of table in HTML file is the desired table
if flag_header == 0:
rows = mytable.findChildren(['th','tr'])
cells = rows[0].findChildren('td')
y = 2
for cell in cells:
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
flag_header = 1
rows = mytable.findChildren(['th','tr'])
for row in rows[1:]:
cells = row.findChildren('td')
y = 2
cnt_line += 1
if cnt_line == num_line: #Check if the order of line in the table is the desired one
for cell in cells:
TABLE_sheet.cell(row=2,column=1).value = 'HTML file name' #Write to excel file
TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
worksheet.save(filepath)
soup.decompose()
gc.collect()
else:
gc.collect()
TABLE_sheet=worksheet['TABLE']
for k in range(0+i*10-1, cnt_total_html_all):
f2 = open(fopen[k])
soup2 = BeautifulSoup(f2, "html.parser")
Matrix_table[k] = soup2.find_all("table")
f2.close()
table_no =
m = 0
line_num = 0
f2 = open(fopen[k])
lines = f2.readlines()
cnt = 0
for line in lines:
line_num = 1 + line_num
if line.find("table") >= 0:
table_no.append(line_num)
cnt += 1
m += 1
f2.close()
start_no = Matrix[k][num]
stop_no = Matrix[k][num+1]
m = 0
flag_table = 0
cnt_line = 0
for m in range(cnt):
if table_no[m] in range(start_no,stop_no):
i1 = (m-1)/2
mytable = Matrix_table[k][i1]
flag_table += 1
if flag_table == num_o*2:
if flag_header == 0:
rows = mytable.findChildren(['th','tr'])
cells = rows[0].findChildren('td')
y = 2
for cell in cells:
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
flag_header = 1
rows = mytable.findChildren(['th','tr'])
for row in rows[1:]:
cells = row.findChildren('td')
y = 2
cnt_line += 1
if cnt_line == num_line:
for cell in cells:
TABLE_sheet.cell(row=1,column=1).value = table_name[num]
TABLE_sheet.cell(row=2,column=1).value = 'HTML file name'
TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
worksheet.save(filepath)
soup2.decompose()
gc.collect()
# ================================================================
My HTML file is exported from the equipment and it does not look like traditional HTML files. It contains only table descriptions. Unfortunately, I cannot modify the structure of the exported HTML file.
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt">
Duration: 0.02 (s)
<br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><a href="#TOC" xmlns:msxsl="urn:schemas-microsoft-com:xslt">Back to top
</a><a name="22" xmlns:msxsl="urn:schemas-microsoft-com:xslt"></a><HR COLOR="#6699CC" SIZE="2" xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>LTE_B1_ESC_APT</b></big></big><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>Status: </b></big><big><b><span style="color:#00CC00">PASS</span></b></big></UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Run Info</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Switchpoint Data</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Temp (C)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RFMode</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Technology</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RF Mode</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Band</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Device</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Instance</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Calibration Mode</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">Streaming not supported</td>
<td align="center">22</td>
<td align="center">34</td>
<td align="center">LTE</td>
<td align="center">34</td>
<td align="center">1</td>
<td align="center">0</td>
<td align="center">0</td>
<td align="center">Composite</td>
<td align="center">2.17</td>
</tr>
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Segment Lengths</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Tx Lin Tx Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>TxRx Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RF Config Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Tuning Seg Len (ms)</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">4</td>
<td align="center">4</td>
<td align="center">60</td>
<td align="center">20</td>
<td><span style="color:white">-</span></td>
</tr>
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">MEASUREMENTS: Tx Linearizer Sweep</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Channel</b></td>
<td bgcolor="#FFFFCC" align="center"><b>PA State</b></td>
<td bgcolor="#FFFFCC" align="center"><b>TxAGC</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Vcc</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Icq</b></td>
<td bgcolor="#6699CC" align="center"><b>Power</b></td>
<td bgcolor="#6699CC" align="center"><b>HDET</b></td>
<td bgcolor="#6699CC" align="center"><b>LPM HDET</b></td>
<td bgcolor="#6699CC" align="center"><b>DeltaPwr</b></td>
<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Min</b></td>
<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Max</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">18300</td>
<td align="center">1</td>
<td align="center">68</td>
<td align="center">3000</td>
<td align="center">235</td>
<td align="center">26.7</td>
<td align="center">20180</td>
<td align="center">-</td>
<td align="center">0</td>
<td align="center">0</td>
<td align="center">0</td>
<td><span style="color:white">-</span></td>
</tr>
<tr>
<td align="center">18300</td>
<td align="center">1</td>
<td align="center">67</td>
<td align="center">3000</td>
<td align="center">235</td>
<td align="center">26.3</td>
<td align="center">18072</td>
<td align="center">-</td>
<td align="center">0.4</td>
<td align="center">-1</td>
<td align="center">8</td>
<td><span style="color:white">-</span></td>
</tr>
The error when I process more than 15 HTML files:
Exception MemoryError: MemoryError() in <generator object prepare_markup at 0x7CC15A80> ignored
Traceback (most recent call last):
File ".html_parsing_Nov_01_max15_1_submit_to_SOF.py", line 297, in <module>
soup2 = BeautifulSoup(f2, "html.parser")
File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 282, in __init__
self._feed()
File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 343, in _feed
self.builder.feed(self.markup)
MemoryError
I have tested that when I have just parsed the HTML file using BS4 without find_all table, the code can process more than 15 files. I also try to use the BS4 only for the first file and then find the line number of the desired contain, however, I can not find the line number of those lines using BS4.
Would you please take a look and give me your comments as possible?
Thank you very much.
html python-2.7 beautifulsoup out-of-memory
add a comment |
I am programming a python 2.7 script to import multiple HTML local files (each data files ~ 3MB).
I use the BeautifulSoup4 package but I meet the memory error when I import more than 15 HTML files.
My code is:
from bs4 import BeautifulSoup
import os
import xlwt
# ============= FIND THE FIRST HTML FILE ======= #
directory ='C:UserskhoiptDesktopdata_extractionCalib_SMP_100HTMLHTML1'
flag = 0
for filename in os.listdir(directory):
if filename.endswith('.HTML'):
fname_1 = os.path.join(directory,filename)
print('Filename: {}'.format(fname_1))
flag = 1
break
if flag == 0:
print('There is no HTML file')
fname_1_open = open(fname_1)
soup = BeautifulSoup(fname_1_open, "html.parser")
fname_1_open.close()
tables = soup.find_all("table")
# ================================================================
# ========= DECLARE EXCEL FILE ==================================
# Code for create a excel file
# ================================================================
# =========== FIND THE NUMBER OF HTML FILE ========
# Find the number of HTML files and print it to cnt_total_html
# print('The number of HTML files is: ', cnt_total_html)
# =================================================
# ====== Find the number of titles in HTML file =================
# Find the number of titles in HTML file and print it to counter_title
# Each title can contain many tables
# ================================================================
# ==== FIND THE NUMBER OF TABLES IN HTML FILE =======================
# Find the number of tables in HTML file and print it to counter_table
# ====================================================================
# =================================================================
print('Please select the desired title: ')
num = input()
print('The desired title is: ', table_name[num])
print('Please select the order of table: ')
num_o = input()
print('The desired order of table is: ', num_o)
print('Please select the order of line: ')
num_line = input()
print('The desired order of line is: ', num_line)
# ====================================================================
# ================ FIND AND PRINT THE DESIRED LINE TO EXCEL FILE ===========
cnt_total_html_all = cnt_total_html #cnt_total_html is the total html files
Matrix_table = [[0 for x in range(counter_table)] for y in range(cnt_total_html)]
line_no = [None]*counter_title
i = 0
j = 0
k = 0
m = 0
n = 0
flag_cnt_html = 1
x = 2
y = 2
flag_table = 0
flag_line = 0
flag_header = 0
cnt_line = 0
# Seperate the whole HTML files into 10-files block
while cnt_total_html > 10:
cnt_total_html = cnt_total_html - 10
flag_cnt_html += 1
print('So flag_cnt_html la: ',flag_cnt_html)
###################################################
for i in range(flag_cnt_html):
if (cnt_total_html_all - i*10) > 10:
TABLE_sheet=worksheet['TABLE']
for k in range(0+i*10,9+i*10):
f = open(fopen[k])
soup = BeautifulSoup(f, "html.parser")
Matrix_table[k] = soup.find_all("table") #Find all table in the HTML file
f.close()
table_no = [None]*counter_table
table_no =
m = 0
f = open(fopen[k])
line_num = 0
lines = f.readlines()
cnt = 0
for line in lines:
line_num = 1 + line_num
if line.find("table") >= 0:
table_no.append(line_num) #find line number of each table
cnt += 1
m += 1
f.close
start_no = Matrix[k][num] #find line number of the desired title
stop_no = Matrix[k][num+1] #find line number of the sibling desired title
m = 0
flag_table = 0
cnt_line = 0
for m in range(cnt):
if table_no[m] in range(start_no,stop_no):
i1 = (m - 1)/2
mytable = Matrix_table[k][i1]
flag_table += 1
if flag_table == num_o*2: #Check if the order of table in HTML file is the desired table
if flag_header == 0:
rows = mytable.findChildren(['th','tr'])
cells = rows[0].findChildren('td')
y = 2
for cell in cells:
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
flag_header = 1
rows = mytable.findChildren(['th','tr'])
for row in rows[1:]:
cells = row.findChildren('td')
y = 2
cnt_line += 1
if cnt_line == num_line: #Check if the order of line in the table is the desired one
for cell in cells:
TABLE_sheet.cell(row=2,column=1).value = 'HTML file name' #Write to excel file
TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
worksheet.save(filepath)
soup.decompose()
gc.collect()
else:
gc.collect()
TABLE_sheet=worksheet['TABLE']
for k in range(0+i*10-1, cnt_total_html_all):
f2 = open(fopen[k])
soup2 = BeautifulSoup(f2, "html.parser")
Matrix_table[k] = soup2.find_all("table")
f2.close()
table_no =
m = 0
line_num = 0
f2 = open(fopen[k])
lines = f2.readlines()
cnt = 0
for line in lines:
line_num = 1 + line_num
if line.find("table") >= 0:
table_no.append(line_num)
cnt += 1
m += 1
f2.close()
start_no = Matrix[k][num]
stop_no = Matrix[k][num+1]
m = 0
flag_table = 0
cnt_line = 0
for m in range(cnt):
if table_no[m] in range(start_no,stop_no):
i1 = (m-1)/2
mytable = Matrix_table[k][i1]
flag_table += 1
if flag_table == num_o*2:
if flag_header == 0:
rows = mytable.findChildren(['th','tr'])
cells = rows[0].findChildren('td')
y = 2
for cell in cells:
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
flag_header = 1
rows = mytable.findChildren(['th','tr'])
for row in rows[1:]:
cells = row.findChildren('td')
y = 2
cnt_line += 1
if cnt_line == num_line:
for cell in cells:
TABLE_sheet.cell(row=1,column=1).value = table_name[num]
TABLE_sheet.cell(row=2,column=1).value = 'HTML file name'
TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
worksheet.save(filepath)
soup2.decompose()
gc.collect()
# ================================================================
My HTML file is exported from the equipment and it does not look like traditional HTML files. It contains only table descriptions. Unfortunately, I cannot modify the structure of the exported HTML file.
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt">
Duration: 0.02 (s)
<br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><a href="#TOC" xmlns:msxsl="urn:schemas-microsoft-com:xslt">Back to top
</a><a name="22" xmlns:msxsl="urn:schemas-microsoft-com:xslt"></a><HR COLOR="#6699CC" SIZE="2" xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>LTE_B1_ESC_APT</b></big></big><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>Status: </b></big><big><b><span style="color:#00CC00">PASS</span></b></big></UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Run Info</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Switchpoint Data</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Temp (C)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RFMode</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Technology</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RF Mode</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Band</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Device</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Instance</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Calibration Mode</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">Streaming not supported</td>
<td align="center">22</td>
<td align="center">34</td>
<td align="center">LTE</td>
<td align="center">34</td>
<td align="center">1</td>
<td align="center">0</td>
<td align="center">0</td>
<td align="center">Composite</td>
<td align="center">2.17</td>
</tr>
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Segment Lengths</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Tx Lin Tx Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>TxRx Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RF Config Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Tuning Seg Len (ms)</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">4</td>
<td align="center">4</td>
<td align="center">60</td>
<td align="center">20</td>
<td><span style="color:white">-</span></td>
</tr>
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">MEASUREMENTS: Tx Linearizer Sweep</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Channel</b></td>
<td bgcolor="#FFFFCC" align="center"><b>PA State</b></td>
<td bgcolor="#FFFFCC" align="center"><b>TxAGC</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Vcc</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Icq</b></td>
<td bgcolor="#6699CC" align="center"><b>Power</b></td>
<td bgcolor="#6699CC" align="center"><b>HDET</b></td>
<td bgcolor="#6699CC" align="center"><b>LPM HDET</b></td>
<td bgcolor="#6699CC" align="center"><b>DeltaPwr</b></td>
<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Min</b></td>
<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Max</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">18300</td>
<td align="center">1</td>
<td align="center">68</td>
<td align="center">3000</td>
<td align="center">235</td>
<td align="center">26.7</td>
<td align="center">20180</td>
<td align="center">-</td>
<td align="center">0</td>
<td align="center">0</td>
<td align="center">0</td>
<td><span style="color:white">-</span></td>
</tr>
<tr>
<td align="center">18300</td>
<td align="center">1</td>
<td align="center">67</td>
<td align="center">3000</td>
<td align="center">235</td>
<td align="center">26.3</td>
<td align="center">18072</td>
<td align="center">-</td>
<td align="center">0.4</td>
<td align="center">-1</td>
<td align="center">8</td>
<td><span style="color:white">-</span></td>
</tr>
The error when I process more than 15 HTML files:
Exception MemoryError: MemoryError() in <generator object prepare_markup at 0x7CC15A80> ignored
Traceback (most recent call last):
File ".html_parsing_Nov_01_max15_1_submit_to_SOF.py", line 297, in <module>
soup2 = BeautifulSoup(f2, "html.parser")
File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 282, in __init__
self._feed()
File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 343, in _feed
self.builder.feed(self.markup)
MemoryError
I have tested that when I have just parsed the HTML file using BS4 without find_all table, the code can process more than 15 files. I also try to use the BS4 only for the first file and then find the line number of the desired contain, however, I can not find the line number of those lines using BS4.
Would you please take a look and give me your comments as possible?
Thank you very much.
html python-2.7 beautifulsoup out-of-memory
add a comment |
I am programming a python 2.7 script to import multiple HTML local files (each data files ~ 3MB).
I use the BeautifulSoup4 package but I meet the memory error when I import more than 15 HTML files.
My code is:
from bs4 import BeautifulSoup
import os
import xlwt
# ============= FIND THE FIRST HTML FILE ======= #
directory ='C:UserskhoiptDesktopdata_extractionCalib_SMP_100HTMLHTML1'
flag = 0
for filename in os.listdir(directory):
if filename.endswith('.HTML'):
fname_1 = os.path.join(directory,filename)
print('Filename: {}'.format(fname_1))
flag = 1
break
if flag == 0:
print('There is no HTML file')
fname_1_open = open(fname_1)
soup = BeautifulSoup(fname_1_open, "html.parser")
fname_1_open.close()
tables = soup.find_all("table")
# ================================================================
# ========= DECLARE EXCEL FILE ==================================
# Code for create a excel file
# ================================================================
# =========== FIND THE NUMBER OF HTML FILE ========
# Find the number of HTML files and print it to cnt_total_html
# print('The number of HTML files is: ', cnt_total_html)
# =================================================
# ====== Find the number of titles in HTML file =================
# Find the number of titles in HTML file and print it to counter_title
# Each title can contain many tables
# ================================================================
# ==== FIND THE NUMBER OF TABLES IN HTML FILE =======================
# Find the number of tables in HTML file and print it to counter_table
# ====================================================================
# =================================================================
print('Please select the desired title: ')
num = input()
print('The desired title is: ', table_name[num])
print('Please select the order of table: ')
num_o = input()
print('The desired order of table is: ', num_o)
print('Please select the order of line: ')
num_line = input()
print('The desired order of line is: ', num_line)
# ====================================================================
# ================ FIND AND PRINT THE DESIRED LINE TO EXCEL FILE ===========
cnt_total_html_all = cnt_total_html #cnt_total_html is the total html files
Matrix_table = [[0 for x in range(counter_table)] for y in range(cnt_total_html)]
line_no = [None]*counter_title
i = 0
j = 0
k = 0
m = 0
n = 0
flag_cnt_html = 1
x = 2
y = 2
flag_table = 0
flag_line = 0
flag_header = 0
cnt_line = 0
# Seperate the whole HTML files into 10-files block
while cnt_total_html > 10:
cnt_total_html = cnt_total_html - 10
flag_cnt_html += 1
print('So flag_cnt_html la: ',flag_cnt_html)
###################################################
for i in range(flag_cnt_html):
if (cnt_total_html_all - i*10) > 10:
TABLE_sheet=worksheet['TABLE']
for k in range(0+i*10,9+i*10):
f = open(fopen[k])
soup = BeautifulSoup(f, "html.parser")
Matrix_table[k] = soup.find_all("table") #Find all table in the HTML file
f.close()
table_no = [None]*counter_table
table_no =
m = 0
f = open(fopen[k])
line_num = 0
lines = f.readlines()
cnt = 0
for line in lines:
line_num = 1 + line_num
if line.find("table") >= 0:
table_no.append(line_num) #find line number of each table
cnt += 1
m += 1
f.close
start_no = Matrix[k][num] #find line number of the desired title
stop_no = Matrix[k][num+1] #find line number of the sibling desired title
m = 0
flag_table = 0
cnt_line = 0
for m in range(cnt):
if table_no[m] in range(start_no,stop_no):
i1 = (m - 1)/2
mytable = Matrix_table[k][i1]
flag_table += 1
if flag_table == num_o*2: #Check if the order of table in HTML file is the desired table
if flag_header == 0:
rows = mytable.findChildren(['th','tr'])
cells = rows[0].findChildren('td')
y = 2
for cell in cells:
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
flag_header = 1
rows = mytable.findChildren(['th','tr'])
for row in rows[1:]:
cells = row.findChildren('td')
y = 2
cnt_line += 1
if cnt_line == num_line: #Check if the order of line in the table is the desired one
for cell in cells:
TABLE_sheet.cell(row=2,column=1).value = 'HTML file name' #Write to excel file
TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
worksheet.save(filepath)
soup.decompose()
gc.collect()
else:
gc.collect()
TABLE_sheet=worksheet['TABLE']
for k in range(0+i*10-1, cnt_total_html_all):
f2 = open(fopen[k])
soup2 = BeautifulSoup(f2, "html.parser")
Matrix_table[k] = soup2.find_all("table")
f2.close()
table_no =
m = 0
line_num = 0
f2 = open(fopen[k])
lines = f2.readlines()
cnt = 0
for line in lines:
line_num = 1 + line_num
if line.find("table") >= 0:
table_no.append(line_num)
cnt += 1
m += 1
f2.close()
start_no = Matrix[k][num]
stop_no = Matrix[k][num+1]
m = 0
flag_table = 0
cnt_line = 0
for m in range(cnt):
if table_no[m] in range(start_no,stop_no):
i1 = (m-1)/2
mytable = Matrix_table[k][i1]
flag_table += 1
if flag_table == num_o*2:
if flag_header == 0:
rows = mytable.findChildren(['th','tr'])
cells = rows[0].findChildren('td')
y = 2
for cell in cells:
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
flag_header = 1
rows = mytable.findChildren(['th','tr'])
for row in rows[1:]:
cells = row.findChildren('td')
y = 2
cnt_line += 1
if cnt_line == num_line:
for cell in cells:
TABLE_sheet.cell(row=1,column=1).value = table_name[num]
TABLE_sheet.cell(row=2,column=1).value = 'HTML file name'
TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
worksheet.save(filepath)
soup2.decompose()
gc.collect()
# ================================================================
My HTML file is exported from the equipment and it does not look like traditional HTML files. It contains only table descriptions. Unfortunately, I cannot modify the structure of the exported HTML file.
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt">
Duration: 0.02 (s)
<br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><a href="#TOC" xmlns:msxsl="urn:schemas-microsoft-com:xslt">Back to top
</a><a name="22" xmlns:msxsl="urn:schemas-microsoft-com:xslt"></a><HR COLOR="#6699CC" SIZE="2" xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>LTE_B1_ESC_APT</b></big></big><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>Status: </b></big><big><b><span style="color:#00CC00">PASS</span></b></big></UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Run Info</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Switchpoint Data</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Temp (C)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RFMode</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Technology</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RF Mode</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Band</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Device</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Instance</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Calibration Mode</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">Streaming not supported</td>
<td align="center">22</td>
<td align="center">34</td>
<td align="center">LTE</td>
<td align="center">34</td>
<td align="center">1</td>
<td align="center">0</td>
<td align="center">0</td>
<td align="center">Composite</td>
<td align="center">2.17</td>
</tr>
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Segment Lengths</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Tx Lin Tx Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>TxRx Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RF Config Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Tuning Seg Len (ms)</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">4</td>
<td align="center">4</td>
<td align="center">60</td>
<td align="center">20</td>
<td><span style="color:white">-</span></td>
</tr>
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">MEASUREMENTS: Tx Linearizer Sweep</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Channel</b></td>
<td bgcolor="#FFFFCC" align="center"><b>PA State</b></td>
<td bgcolor="#FFFFCC" align="center"><b>TxAGC</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Vcc</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Icq</b></td>
<td bgcolor="#6699CC" align="center"><b>Power</b></td>
<td bgcolor="#6699CC" align="center"><b>HDET</b></td>
<td bgcolor="#6699CC" align="center"><b>LPM HDET</b></td>
<td bgcolor="#6699CC" align="center"><b>DeltaPwr</b></td>
<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Min</b></td>
<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Max</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">18300</td>
<td align="center">1</td>
<td align="center">68</td>
<td align="center">3000</td>
<td align="center">235</td>
<td align="center">26.7</td>
<td align="center">20180</td>
<td align="center">-</td>
<td align="center">0</td>
<td align="center">0</td>
<td align="center">0</td>
<td><span style="color:white">-</span></td>
</tr>
<tr>
<td align="center">18300</td>
<td align="center">1</td>
<td align="center">67</td>
<td align="center">3000</td>
<td align="center">235</td>
<td align="center">26.3</td>
<td align="center">18072</td>
<td align="center">-</td>
<td align="center">0.4</td>
<td align="center">-1</td>
<td align="center">8</td>
<td><span style="color:white">-</span></td>
</tr>
The error when I process more than 15 HTML files:
Exception MemoryError: MemoryError() in <generator object prepare_markup at 0x7CC15A80> ignored
Traceback (most recent call last):
File ".html_parsing_Nov_01_max15_1_submit_to_SOF.py", line 297, in <module>
soup2 = BeautifulSoup(f2, "html.parser")
File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 282, in __init__
self._feed()
File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 343, in _feed
self.builder.feed(self.markup)
MemoryError
I have tested that when I have just parsed the HTML file using BS4 without find_all table, the code can process more than 15 files. I also try to use the BS4 only for the first file and then find the line number of the desired contain, however, I can not find the line number of those lines using BS4.
Would you please take a look and give me your comments as possible?
Thank you very much.
html python-2.7 beautifulsoup out-of-memory
I am programming a python 2.7 script to import multiple HTML local files (each data files ~ 3MB).
I use the BeautifulSoup4 package but I meet the memory error when I import more than 15 HTML files.
My code is:
from bs4 import BeautifulSoup
import os
import xlwt
# ============= FIND THE FIRST HTML FILE ======= #
directory ='C:UserskhoiptDesktopdata_extractionCalib_SMP_100HTMLHTML1'
flag = 0
for filename in os.listdir(directory):
if filename.endswith('.HTML'):
fname_1 = os.path.join(directory,filename)
print('Filename: {}'.format(fname_1))
flag = 1
break
if flag == 0:
print('There is no HTML file')
fname_1_open = open(fname_1)
soup = BeautifulSoup(fname_1_open, "html.parser")
fname_1_open.close()
tables = soup.find_all("table")
# ================================================================
# ========= DECLARE EXCEL FILE ==================================
# Code for create a excel file
# ================================================================
# =========== FIND THE NUMBER OF HTML FILE ========
# Find the number of HTML files and print it to cnt_total_html
# print('The number of HTML files is: ', cnt_total_html)
# =================================================
# ====== Find the number of titles in HTML file =================
# Find the number of titles in HTML file and print it to counter_title
# Each title can contain many tables
# ================================================================
# ==== FIND THE NUMBER OF TABLES IN HTML FILE =======================
# Find the number of tables in HTML file and print it to counter_table
# ====================================================================
# =================================================================
print('Please select the desired title: ')
num = input()
print('The desired title is: ', table_name[num])
print('Please select the order of table: ')
num_o = input()
print('The desired order of table is: ', num_o)
print('Please select the order of line: ')
num_line = input()
print('The desired order of line is: ', num_line)
# ====================================================================
# ================ FIND AND PRINT THE DESIRED LINE TO EXCEL FILE ===========
cnt_total_html_all = cnt_total_html #cnt_total_html is the total html files
Matrix_table = [[0 for x in range(counter_table)] for y in range(cnt_total_html)]
line_no = [None]*counter_title
i = 0
j = 0
k = 0
m = 0
n = 0
flag_cnt_html = 1
x = 2
y = 2
flag_table = 0
flag_line = 0
flag_header = 0
cnt_line = 0
# Seperate the whole HTML files into 10-files block
while cnt_total_html > 10:
cnt_total_html = cnt_total_html - 10
flag_cnt_html += 1
print('So flag_cnt_html la: ',flag_cnt_html)
###################################################
for i in range(flag_cnt_html):
if (cnt_total_html_all - i*10) > 10:
TABLE_sheet=worksheet['TABLE']
for k in range(0+i*10,9+i*10):
f = open(fopen[k])
soup = BeautifulSoup(f, "html.parser")
Matrix_table[k] = soup.find_all("table") #Find all table in the HTML file
f.close()
table_no = [None]*counter_table
table_no =
m = 0
f = open(fopen[k])
line_num = 0
lines = f.readlines()
cnt = 0
for line in lines:
line_num = 1 + line_num
if line.find("table") >= 0:
table_no.append(line_num) #find line number of each table
cnt += 1
m += 1
f.close
start_no = Matrix[k][num] #find line number of the desired title
stop_no = Matrix[k][num+1] #find line number of the sibling desired title
m = 0
flag_table = 0
cnt_line = 0
for m in range(cnt):
if table_no[m] in range(start_no,stop_no):
i1 = (m - 1)/2
mytable = Matrix_table[k][i1]
flag_table += 1
if flag_table == num_o*2: #Check if the order of table in HTML file is the desired table
if flag_header == 0:
rows = mytable.findChildren(['th','tr'])
cells = rows[0].findChildren('td')
y = 2
for cell in cells:
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
flag_header = 1
rows = mytable.findChildren(['th','tr'])
for row in rows[1:]:
cells = row.findChildren('td')
y = 2
cnt_line += 1
if cnt_line == num_line: #Check if the order of line in the table is the desired one
for cell in cells:
TABLE_sheet.cell(row=2,column=1).value = 'HTML file name' #Write to excel file
TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
worksheet.save(filepath)
soup.decompose()
gc.collect()
else:
gc.collect()
TABLE_sheet=worksheet['TABLE']
for k in range(0+i*10-1, cnt_total_html_all):
f2 = open(fopen[k])
soup2 = BeautifulSoup(f2, "html.parser")
Matrix_table[k] = soup2.find_all("table")
f2.close()
table_no =
m = 0
line_num = 0
f2 = open(fopen[k])
lines = f2.readlines()
cnt = 0
for line in lines:
line_num = 1 + line_num
if line.find("table") >= 0:
table_no.append(line_num)
cnt += 1
m += 1
f2.close()
start_no = Matrix[k][num]
stop_no = Matrix[k][num+1]
m = 0
flag_table = 0
cnt_line = 0
for m in range(cnt):
if table_no[m] in range(start_no,stop_no):
i1 = (m-1)/2
mytable = Matrix_table[k][i1]
flag_table += 1
if flag_table == num_o*2:
if flag_header == 0:
rows = mytable.findChildren(['th','tr'])
cells = rows[0].findChildren('td')
y = 2
for cell in cells:
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
flag_header = 1
rows = mytable.findChildren(['th','tr'])
for row in rows[1:]:
cells = row.findChildren('td')
y = 2
cnt_line += 1
if cnt_line == num_line:
for cell in cells:
TABLE_sheet.cell(row=1,column=1).value = table_name[num]
TABLE_sheet.cell(row=2,column=1).value = 'HTML file name'
TABLE_sheet.cell(row=x,column=1).value = fname_1[k]
TABLE_sheet.cell(row=x, column=y).value = cell.get_text()
worksheet.save(filepath)
y = y + 1
x = x + 1
worksheet.save(filepath)
soup2.decompose()
gc.collect()
# ================================================================
My HTML file is exported from the equipment and it does not look like traditional HTML files. It contains only table descriptions. Unfortunately, I cannot modify the structure of the exported HTML file.
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt">
Duration: 0.02 (s)
<br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><a href="#TOC" xmlns:msxsl="urn:schemas-microsoft-com:xslt">Back to top
</a><a name="22" xmlns:msxsl="urn:schemas-microsoft-com:xslt"></a><HR COLOR="#6699CC" SIZE="2" xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>LTE_B1_ESC_APT</b></big></big><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt"><big><b>Status: </b></big><big><b><span style="color:#00CC00">PASS</span></b></big></UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Run Info</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Switchpoint Data</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Temp (C)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RFMode</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Technology</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RF Mode</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Band</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Device</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Instance</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Calibration Mode</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">Streaming not supported</td>
<td align="center">22</td>
<td align="center">34</td>
<td align="center">LTE</td>
<td align="center">34</td>
<td align="center">1</td>
<td align="center">0</td>
<td align="center">0</td>
<td align="center">Composite</td>
<td align="center">2.17</td>
</tr>
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">Segment Lengths</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Tx Lin Tx Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>TxRx Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>RF Config Seg Len (ms)</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Tuning Seg Len (ms)</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">4</td>
<td align="center">4</td>
<td align="center">60</td>
<td align="center">20</td>
<td><span style="color:white">-</span></td>
</tr>
</table>
</UL><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><br xmlns:msxsl="urn:schemas-microsoft-com:xslt"><b xmlns:msxsl="urn:schemas-microsoft-com:xslt">MEASUREMENTS: Tx Linearizer Sweep</b><UL xmlns:msxsl="urn:schemas-microsoft-com:xslt">
<table border="1">
<tr>
<td bgcolor="#FFFFCC" align="center"><b>Channel</b></td>
<td bgcolor="#FFFFCC" align="center"><b>PA State</b></td>
<td bgcolor="#FFFFCC" align="center"><b>TxAGC</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Vcc</b></td>
<td bgcolor="#FFFFCC" align="center"><b>Icq</b></td>
<td bgcolor="#6699CC" align="center"><b>Power</b></td>
<td bgcolor="#6699CC" align="center"><b>HDET</b></td>
<td bgcolor="#6699CC" align="center"><b>LPM HDET</b></td>
<td bgcolor="#6699CC" align="center"><b>DeltaPwr</b></td>
<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Min</b></td>
<td bgcolor="#CCCCCC" align="center"><b>DeltaPwr Max</b></td>
<td bgcolor="#EEEEE0" align="center"><b>Time (s)</b></td>
</tr>
<tr>
<td align="center">18300</td>
<td align="center">1</td>
<td align="center">68</td>
<td align="center">3000</td>
<td align="center">235</td>
<td align="center">26.7</td>
<td align="center">20180</td>
<td align="center">-</td>
<td align="center">0</td>
<td align="center">0</td>
<td align="center">0</td>
<td><span style="color:white">-</span></td>
</tr>
<tr>
<td align="center">18300</td>
<td align="center">1</td>
<td align="center">67</td>
<td align="center">3000</td>
<td align="center">235</td>
<td align="center">26.3</td>
<td align="center">18072</td>
<td align="center">-</td>
<td align="center">0.4</td>
<td align="center">-1</td>
<td align="center">8</td>
<td><span style="color:white">-</span></td>
</tr>
The error when I process more than 15 HTML files:
Exception MemoryError: MemoryError() in <generator object prepare_markup at 0x7CC15A80> ignored
Traceback (most recent call last):
File ".html_parsing_Nov_01_max15_1_submit_to_SOF.py", line 297, in <module>
soup2 = BeautifulSoup(f2, "html.parser")
File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 282, in __init__
self._feed()
File "C:Python27libsite-packagesbeautifulsoup4-4.6.3-py2.7.eggbs4__init__.py", line 343, in _feed
self.builder.feed(self.markup)
MemoryError
I have tested that when I have just parsed the HTML file using BS4 without find_all table, the code can process more than 15 files. I also try to use the BS4 only for the first file and then find the line number of the desired contain, however, I can not find the line number of those lines using BS4.
Would you please take a look and give me your comments as possible?
Thank you very much.
html python-2.7 beautifulsoup out-of-memory
html python-2.7 beautifulsoup out-of-memory
asked Jan 3 at 2:53
khoiptkhoipt
11
11
add a comment |
add a comment |
0
active
oldest
votes
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54015736%2fpython-beautifulsoup-html-memory-error-when-import-multiple-html-files%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
0
active
oldest
votes
0
active
oldest
votes
active
oldest
votes
active
oldest
votes
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54015736%2fpython-beautifulsoup-html-memory-error-when-import-multiple-html-files%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown