Install lxml
First, run `pip install lxml` to install the lxml library.
If you encounter the following error on Ubuntu:
#include "libxml/xmlversion.h" compilation terminated. error: command 'x86_64-linux-gnu-gcc' failed with exit status 1 ---------------------------------------- Cleaning up... Removing temporary dir /tmp/pip_build_root... Command /usr/bin/python -c "import setuptools, tokenize;__file__='/tmp/pip_build_root/lxml/setup.py';exec(compile(getattr(tokenize, 'open', open)(__file__).read().replace('\r\n', '\n'), __file__, 'exec'))" install --record /tmp/pip-O4cIn6-record/install-record.txt --single-version-externally-managed --compile failed with error code 1 in /tmp/pip_build_root/lxml Exception information: Traceback (most recent call last): File "/usr/lib/python2.7/dist-packages/pip/basecommand.py", line 122, in main status = self.run(options, args) File "/usr/lib/python2.7/dist-packages/pip/commands/install.py", line 283, in run requirement_set.install(install_options, global_options, root=options.root_path) File "/usr/lib/python2.7/dist-packages/pip/req.py", line 1435, in install requirement.install(install_options, global_options, *args, **kwargs) File "/usr/lib/python2.7/dist-packages/pip/req.py", line 706, in install cwd=self.source_dir, filter_stdout=self._filter_install, show_stdout=False) File "/usr/lib/python2.7/dist-packages/pip/util.py", line 697, in call_subprocess % (command_desc, proc.returncode, cwd)) InstallationError: Command /usr/bin/python -c "import setuptools, tokenize;__file__='/tmp/pip_build_root/lxml/setup.py';exec(compile(getattr(tokenize, 'open', open)(__file__).read().replace('\r\n', '\n'), __file__, 'exec'))" install --record /tmp/pip-O4cIn6-record/install-record.txt --single-version-externally-managed --compile failed with error code 1 in /tmp/pip_build_root/lxml
Please install the following dependencies:
sudo apt-get install libxml2-dev libxslt1-dev
Python code
The following code generates sitemap and sitemap index files. Pass in the parameters you need, or extend it with additional fields:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Generate sitemap and sitemap index XML files with lxml."""
import io
import re
from xml.sax.saxutils import unescape

from lxml import etree

SITEMAP_NS = 'http://www.sitemaps.org/schemas/sitemap/0.9'
XML_HEADER = u'<?xml version="1.0" encoding="UTF-8"?>\n'

# Non-greedy so that several <loc> elements on one line are matched separately;
# accepts both http and https URLs.
_LOC_RE = re.compile(r'<loc>(https?://.+?)</loc>')


def _write_xml(filename, root):
    """Serialize *root* to *filename* as UTF-8, preceded by the XML header."""
    body = etree.tostring(root, encoding='utf-8', pretty_print=True)
    with io.open(filename, 'w', encoding='utf-8') as f:
        # tostring() returns bytes; decode explicitly instead of relying on
        # the Python 2 only ``unicode()`` builtin.
        f.write(XML_HEADER + body.decode('utf-8'))


def generate_xml(filename, url_list):
    """Write a new sitemap file containing one <url> entry per URL.

    :param filename: path of the sitemap file to create (overwritten).
    :param url_list: iterable of URL strings; lxml escapes them on output.
    """
    root = etree.Element('urlset', xmlns=SITEMAP_NS)
    for each in url_list:
        url = etree.SubElement(root, 'url')
        etree.SubElement(url, 'loc').text = each
    _write_xml(filename, root)


def update_xml(filename, url_list):
    """Rewrite an existing sitemap with *url_list* followed by its old URLs.

    :param filename: path of an existing sitemap file; rewritten in place.
    :param url_list: new URLs to add. The caller's list is not modified.
    """
    old_url_list = []
    with io.open(filename, 'r', encoding='utf-8') as f:
        for each_line in f:
            # The file holds escaped text (e.g. "&amp;"); unescape it so the
            # URL is not escaped a second time when it is written back.
            old_url_list.extend(
                unescape(found) for found in _LOC_RE.findall(each_line))
    generate_xml(filename, list(url_list) + old_url_list)


def generate_xml_index(filename, sitemap_list, lastmod_list):
    """Write a sitemap index file.

    :param filename: path of the index file to create (overwritten).
    :param sitemap_list: URLs of the individual sitemap files.
    :param lastmod_list: last-modified timestamps, paired by position with
        *sitemap_list*. Pairing uses zip(), so extra items in the longer
        list are ignored.
    """
    root = etree.Element('sitemapindex', xmlns=SITEMAP_NS)
    for each_sitemap, each_lastmod in zip(sitemap_list, lastmod_list):
        sitemap = etree.SubElement(root, 'sitemap')
        etree.SubElement(sitemap, 'loc').text = each_sitemap
        etree.SubElement(sitemap, 'lastmod').text = each_lastmod
    _write_xml(filename, root)


# Backward-compatible alias for the original, misspelled function name.
generatr_xml_index = generate_xml_index


if __name__ == '__main__':
    urls = ['http://www.baidu.com'] * 10
    mods = ['2004-10-01T18:23:17+00:00'] * 10
    generate_xml_index('index.xml', urls, mods)
Effect
The generated output should have the following format:
sitemap format:
<?xml version="1.0" encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url> <loc>http://www.example.com/foo.html</loc> </url> </urlset>
sitemapindex format:
<?xml version="1.0" encoding="UTF-8"?> <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <sitemap> <loc>http://www.example.com/sitemap1.xml.gz</loc> <lastmod>2004-10-01T18:23:17+00:00</lastmod> </sitemap> <sitemap> <loc>http://www.example.com/sitemap2.xml.gz</loc> <lastmod>2005-01-01</lastmod> </sitemap> </sitemapindex>
lastmod time format problem
The lastmod value uses the ISO 8601 standard. On Linux/Unix systems, you can use the following function to obtain a file's last-modified time in that format:
import os
import time


def get_lastmod_time(filename):
    """Return the last-modified time of *filename* as an ISO 8601 UTC string.

    :param filename: path of the file to inspect.
    :returns: a timestamp such as ``2004-10-01T18:23:17Z``.
    :raises OSError: if the file does not exist or is inaccessible.
    """
    time_stamp = os.path.getmtime(filename)
    # The trailing "Z" designates UTC, so the timestamp must be converted
    # with gmtime(); localtime() would label local wall-clock time as UTC.
    return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(time_stamp))
Optimization
Generally speaking, building the whole document with lxml is inefficient and uses a lot of memory. You can instead write the XML directly with the file object's write method.
import glob
import gzip
import os
import re
import time
from os.path import dirname, join
from xml.sax.saxutils import escape, unescape

# Non-greedy, and accepts both http and https URLs.
_LOC_RE = re.compile(r'<loc>(https?://.+?)</loc>')


def generate_xml(filename, url_list):
    """Write a gzip-compressed sitemap with one <url> entry per URL.

    :param filename: path of the ``.xml.gz`` file to create (overwritten).
    :param url_list: iterable of URL strings; they are XML-escaped on output.
    """
    # Text mode: on Python 3 a gzip file opened with plain "w" accepts bytes
    # only, so writing str would raise TypeError.
    with gzip.open(filename, 'wt', encoding='utf-8') as f:
        f.write('<?xml version="1.0" encoding="utf-8"?>\n'
                '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
        f.writelines(
            '<url><loc>%s</loc></url>\n' % escape(url) for url in url_list)
        f.write('</urlset>')


def append_xml(filename, url_list):
    """Merge *url_list* with the URLs already in *filename* and rewrite it.

    Duplicates are dropped; new URLs come first, then the existing ones, each
    in first-seen order. The caller's list is not modified.

    :param filename: path of an existing gzip-compressed sitemap.
    :param url_list: iterable of new URL strings.
    """
    merged = list(url_list)
    with gzip.open(filename, 'rt', encoding='utf-8') as f:
        for each_line in f:
            # Stored URLs are XML-escaped; unescape them so that they are not
            # escaped a second time when the file is regenerated.
            merged.extend(unescape(u) for u in _LOC_RE.findall(each_line))
    # dict.fromkeys() de-duplicates while keeping insertion order.
    generate_xml(filename, list(dict.fromkeys(merged)))


def modify_time(filename):
    """Return the last-modified time of *filename* as an ISO 8601 UTC string.

    :param filename: path of the file to inspect.
    :returns: a timestamp such as ``2004-10-01T18:23:17Z``.
    """
    time_stamp = os.path.getmtime(filename)
    # "Z" means UTC, hence gmtime() rather than localtime().
    return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(time_stamp))


def new_xml(filename, url_list):
    """Write a sitemap file and regenerate the sitemap index that lists it.

    The index ``sitemap.xml`` is written to the parent of the directory that
    holds *filename* and lists every ``*.xml.gz`` file in that directory.

    :param filename: path of the gzip-compressed sitemap to create.
    :param url_list: iterable of URL strings for that sitemap.
    """
    generate_xml(filename, url_list)
    root = dirname(filename)
    with open(join(dirname(root), "sitemap.xml"), "w", encoding="utf-8") as f:
        f.write('<?xml version="1.0" encoding="utf-8"?>\n'
                '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
        # sorted() makes the index order deterministic across runs.
        for path in sorted(glob.glob(join(root, "*.xml.gz"))):
            lastmod = modify_time(path)
            # CONFIG is provided by the surrounding project.
            # NOTE(review): "http:/" has a single slash, so the remainder is
            # presumably expected to start with "/<host>/..." -- confirm
            # against the value of CONFIG.SITEMAP_PATH.
            relative = path[len(CONFIG.SITEMAP_PATH):]
            f.write("<sitemap>\n<loc>http:/%s</loc>\n" % escape(relative))
            f.write("<lastmod>%s</lastmod>\n</sitemap>\n" % lastmod)
        f.write('</sitemapindex>')
Summary
That is the entire content of this article. I hope it is helpful to everyone learning or using Python. If you have any questions, you can leave a message to discuss them. Thank you for your support of the PHP Chinese website.
For more articles on how to use Python scripts to generate sitemap.xml, please follow the PHP Chinese website!

JVM's performance is competitive with other runtimes, offering a balance of speed, safety, and productivity. 1) JVM uses JIT compilation for dynamic optimizations. 2) C++ offers native performance but lacks JVM's safety features. 3) Python is slower but easier to use. 4) JavaScript's JIT is les…

Java achieves platform independence through the Java Virtual Machine (JVM), allowing code to run on any platform with a JVM. 1) Code is compiled into bytecode, not machine-specific code. 2) Bytecode is interpreted by the JVM, enabling cross-platform execution. 3) Developers should test across…

The JVM is an abstract computing machine crucial for running Java programs due to its platform-independent architecture. It includes: 1) Class Loader for loading classes, 2) Runtime Data Area for data storage, 3) Execution Engine with Interpreter, JIT Compiler, and Garbage Collector for bytec…

JVM has a close relationship with the OS as it translates Java bytecode into machine-specific instructions, manages memory, and handles garbage collection. This relationship allows Java to run on various OS environments, but it also presents challenges like different JVM behaviors and OS-spe…

Java implements "write once, run anywhere" (WORA) by compiling source code into bytecode that runs on a Java Virtual Machine (JVM). 1) Write Java code and compile it into bytecode. 2) The bytecode runs on any platform with a JVM installed. 3) Use the Java Native Interface (JNI) to handle platform-specific functions. Despite challenges such as JVM consistency and the use of platform-specific libraries, WORA greatly improves development efficiency and deployment flexibility.

Java achieves platform independence through the Java Virtual Machine (JVM), allowing code to run on different operating systems without modification. The JVM compiles Java code into platform-independent bytecode, which it then interprets and executes on the specific OS, abstracting away OS…

Java is powerful due to its platform independence, object-oriented nature, rich standard library, performance capabilities, and strong security features. 1) Platform independence allows applications to run on any device supporting Java. 2) Object-oriented programming promotes modular a…

The top Java features include: 1) object-oriented programming, which supports polymorphism and improves code flexibility and maintainability; 2) the exception handling mechanism, which improves code robustness through try-catch-finally blocks; 3) garbage collection, which simplifies memory management; 4) generics, which enhance type safety; 5) lambda expressions and functional programming, which make code more concise and expressive; 6) rich standard libraries, which provide optimized data structures and algorithms.


Hot AI Tools

Undresser.AI Undress
AI-powered app for creating realistic nude photos

AI Clothes Remover
Online AI tool for removing clothes from photos.

Undress AI Tool
Undress images for free

Clothoff.io
AI clothes remover

Video Face Swap
Swap faces in any video effortlessly with our completely free AI face swap tool!

Hot Article

Hot Tools

DVWA
Damn Vulnerable Web App (DVWA) is a PHP/MySQL web application that is very vulnerable. Its main goals are to be an aid for security professionals to test their skills and tools in a legal environment, to help web developers better understand the process of securing web applications, and to help teachers/students teach/learn in a classroom environment Web application security. The goal of DVWA is to practice some of the most common web vulnerabilities through a simple and straightforward interface, with varying degrees of difficulty. Please note that this software

EditPlus Chinese cracked version
Small size, syntax highlighting, does not support code prompt function

Zend Studio 13.0.1
Powerful PHP integrated development environment

VSCode Windows 64-bit Download
A free and powerful IDE editor launched by Microsoft

Dreamweaver Mac version
Visual web development tools
