This commit is contained in:
Michael Murtaugh
2019-05-27 16:05:50 +02:00
commit b027ca341a
29 changed files with 17864 additions and 0 deletions

scripts/buildsitemap.py — new file, 105 lines (view file)

@@ -0,0 +1,105 @@
from mwclient import Site, APIError
from mwclient.page import Page
import argparse, json, sys
# Command-line interface: wiki connection options plus output controls.
ap = argparse.ArgumentParser("")
ap.add_argument("--wikiprotocol", default="http")
ap.add_argument("--wikihost", default="localhost")
ap.add_argument("--wikipath", default="/mw/")
# type=int: argparse delivers strings by default, and the page-count
# comparison in pass 2 needs an integer ('int >= str' raises TypeError).
ap.add_argument("--limit", default=None, type=int)
ap.add_argument("--output", default=sys.stdout, type=argparse.FileType('w'))
# ap.add_argument("--user", default=None)
# ap.add_argument("--password", default=None)
args = ap.parse_args()
# Connect to the target MediaWiki instance.
site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
def category_subcats (site, cattitle, objects=True):
    """Return all subcategories of *cattitle*, following API continuation.

    Args:
        site: mwclient Site (anything exposing .api() and .pages).
        cattitle: full category title, e.g. "Catégorie:Équipes".
        objects: when True, return Page objects via site.pages;
            when False, return the raw title strings.
    """
    titles = []
    cmcontinue = None
    while True:
        # Build the query once; only add cmcontinue when the API gave us one
        # (avoids the duplicated call of the original).
        kwargs = dict(list="categorymembers", cmtitle=cattitle,
                      cmtype="subcat", cmlimit=50)
        if cmcontinue is not None:  # idiomatic identity test, not == None
            kwargs["cmcontinue"] = cmcontinue
        resp = site.api("query", **kwargs)
        titles.extend(x['title'] for x in resp['query']['categorymembers'])
        if 'continue' not in resp:
            break  # no more pages of results
        cmcontinue = resp['continue']['cmcontinue']
    if objects:
        return [site.pages[t] for t in titles]
    return titles
"""
>>> c.page_title
'Équipes'
>>> c.name
'Catégorie:Équipes'
"""
"""
graph = {
nodes: [
{name: "Bienvenue"}
]
links: [
{source: "name", target: "name2"}
]
}
"""
# Accumulated state shared by the two passes below.
redirects = {}  # redirect page name -> target page name (pass 1)
pages = []  # node dicts for the output graph (pass 2)
count = 0  # non-redirect pages processed so far, for --limit
all_links = set()  # undirected edges as ordered (name, name) tuples
page_exists = {}  # page name -> True for every non-redirect page (pass 1)
def resolve (x, redirect_map=None):
    """Follow redirects from *x* to the final target page name.

    Args:
        x: starting page name.
        redirect_map: mapping of redirect name -> target name; defaults
            to the module-level ``redirects`` table built in pass 1.

    Returns the first name that is not itself a redirect. A redirect
    cycle stops at the first name seen twice instead of looping forever
    (the original would hang on e.g. A -> B -> A).
    """
    table = redirects if redirect_map is None else redirect_map
    seen = set()
    while x in table:
        if x in seen:
            break  # redirect cycle — bail out rather than spin
        seen.add(x)
        x = table[x]
    return x
print ("Pass 1")
for p in site.allpages():
r = p.redirects_to()
if (r):
redirects[p.name] = r.name
else:
page_exists[p.name] = True
print ("Pass 2")
# pages_by_title = {}
for p in site.allpages():
if (p.name in redirects):
continue
pd = {}
pd['title'] = p.page_title
pd['ns'] = 0
print ("Page {0}".format(p.name), file=sys.stderr)
# categories
cats = [c.page_title for c in p.categories()]
pd['cats'] = cats
# links
links = [x for x in p.links() if x.namespace == 0]
links = [resolve(x.name) for x in links]
links = [x for x in links if x in page_exists]
for l in links:
if p.name < l:
link = (p.name, l)
else:
link = (l, p.name)
all_links.add(link)
pages.append(pd)
count += 1
if args.limit and count >= args.limit:
break
# Assemble the d3-style graph structure and write it out as JSON.
graph = {
    'nodes': pages,
    'links': [{'source': src, 'target': dst} for src, dst in all_links],
    'redirects': redirects,
}
print(json.dumps(graph, indent=2), file=args.output)