我试图解析以下 xml 并围绕我的业务需求获取我感兴趣的特定标签。我想我做错了什么。不知道如何解析我所需的标签?想要利用 pandas,以便我可以进一步过滤细节。学弟学妹们多多支持
我的 XML 来自 URI
<couponfeed>
<TotalMatches>1459</TotalMatches>
<TotalPages>3</TotalPages>
<PageNumberRequested>1</PageNumberRequested>
<link type="TEXT">
<categories>
<category id="1">Apparel</category>
</categories>
<promotiontypes>
<promotiontype id="11">Percentage off</promotiontype>
</promotiontypes>
<offerdescription>25% Off Boys Quiksilver Apparel. Shop now at Macys.com! Valid 7/23 through 7/25!</offerdescription>
<offerstartdate>2020-07-24</offerstartdate>
<offerenddate>2020-07-26</offerenddate>
<clickurl>https://click.synergy.com/fs-bin/click?id=Z&offerid=777210.100474694&type=3&subid=0</clickurl>
<impressionpixel>https://ad.synergy.com/fs-bin/show?id=ZNAweM&bids=777210.100474694&type=3&subid=0</impressionpixel>
<advertiserid>3184</advertiserid>
<advertisername>cys.com</advertisername>
<network id="1">US Network</network>
</link>
<link type="TEXT">
<categories>
<category id="1">Apparel</category>
</categories>
<promotiontypes>
<promotiontype id="11">Percentage off</promotiontype>
</promotiontypes>
<offerdescription>25% Off Boys' Quiksilver Apparel. Shop now at Macys.com! Valid 7/23 through 7/25!</offerdescription>
<offerstartdate>2020-07-24</offerstartdate>
<offerenddate>2020-07-26</offerenddate>
<clickurl>https://click.synergy.com/fs-bin/click?id=ZZvk49eM&offerid=777210.100474695&type=3&subid=0</clickurl>
<impressionpixel>https://ad.synergy.com/fs-bin/show?id=ZZvk49NAwbids=777210.100474695&type=3&subid=0</impressionpixel>
<advertiserid>3184</advertiserid>
<advertisername>cys.com</advertisername>
<network id="1">US Network</network>
</link>
My Code
from xml.dom import minidom
import urllib
import pandas as pd
url = "http://couponfeed.synergy.com/coupon?token=xxxxxxxxx122b&network=1&resultsperpage=500"
xmldoc = minidom.parse(urllib.request.urlopen(url))
#itemlist = xmldoc.getElementsByTagName('clickurl')
df_cols = ["promotiontype","category","offerdescription", "offerstartdate", "offerenddate", "clickurl","impressionpixel","advertisername","network"]
rows = []
for entry in xmldoc.couponfeed:
s_promotiontype = couponfeed.get("promotiontype","")
s_category = couponfeed.get("category","")
s_offerdescription = couponfeed.get("offerdescription", "")
s_offerstartdate = couponfeed.get("offerstartdate", "")
s_offerenddate = couponfeed.get("offerenddate", "")
s_clickurl = couponfeed.get("clickurl", "")
s_impressionpixel = couponfeed.get("impressionpixel", "")
s_advertisername = couponfeed.get("advertisername","")
s_network = couponfeed.get ("network","")
rows.append({"promotiontype":s_promotiontype, "category": s_category, "offerdescription": s_offerdescription,
"offerstartdate": s_offerstartdate, "offerenddate": s_offerenddate,"clickurl": s_clickurl,"impressionpixel":s_impressionpixel,
"advertisername": s_advertisername,"network": s_network})
out_df = pd.DataFrame(rows, columns=df_cols)
out_df.to_csv(r"C:\\Users\rai\Downloads\\merchants_offers_share.csv", index=False)
尝试简单的方法但我没有得到任何结果
import lxml.etree as ET
import urllib
response = urllib.request.urlopen('http://couponfeed.synergy.com/coupon?token=xxxxxd39f4e5fe392a25538bb122b&network=1&resultsperpage=500')
xml = response.read()
root = ET.fromstring(xml)
for item in root.findall('.//item'):
title = item.find('category').text
print (title)
另一次尝试
from lxml import etree
import pandas as pd
import urllib
url = "http://couponfeed.synergy.com/coupon?token=xxxxxxd39f4e5fe392a25538bb122b&network=1&resultsperpage=500"
xtree = etree.parse(urllib.request.urlopen(url))
for value in xtree.xpath("/root/couponfeed/categories"):
print(value.text)