92 lines
2.8 KiB
Python
92 lines
2.8 KiB
Python
from mwclient import Site, APIError
|
|
from mwclient.page import Page
|
|
import argparse, json, sys
|
|
|
|
import unidecode
|
|
# unaccented_string = unidecode.unidecode(accented_string)
|
|
|
|
# Command-line interface: the first three options locate the wiki, --output
# selects where the JSON result is written (standard output by default).
ap = argparse.ArgumentParser(prog="")
for _flag, _fallback in (
    ("--wikiprotocol", "http"),
    ("--wikihost", "localhost"),
    ("--wikipath", "/mw/"),
    ("--limit", None),
):
    ap.add_argument(_flag, default=_fallback)
ap.add_argument("--output", type=argparse.FileType('w'), default=sys.stdout)
# ap.add_argument("--user", default=None)
# ap.add_argument("--password", default=None)
args = ap.parse_args()

# Wiki handle shared by every query issued further down in the script.
site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)
|
|
|
|
def category_subcats(site, cattitle, objects=True):
    """Return the direct subcategories of the category *cattitle*.

    Issues ``list=categorymembers`` queries restricted to ``cmtype=subcat``,
    50 entries per request, and keeps requesting while the response carries
    a ``cmcontinue`` token inside its ``continue`` element.

    Args:
        site: Object exposing ``api(action, **params)`` and a ``pages``
            mapping indexed by title (the script passes its mwclient site).
        cattitle: Value sent as the ``cmtitle`` parameter.
        objects: When true, every collected title is looked up in
            ``site.pages`` and the looked-up objects are returned; when
            false, the plain title strings are returned.

    Returns:
        A list, in the order the API returned the members.
    """
    params = {
        "list": "categorymembers",
        "cmtitle": cattitle,
        "cmtype": "subcat",
        "cmlimit": 50,
    }
    titles = []
    while True:
        resp = site.api("query", **params)
        titles.extend(
            member['title'] for member in resp['query']['categorymembers']
        )
        # A response without a continuation token is the last page.
        cmcontinue = resp.get('continue', {}).get('cmcontinue')
        if cmcontinue is None:
            break
        # Only the token changes between requests, so reuse the parameters.
        params['cmcontinue'] = cmcontinue
    if objects:
        # print "converting to page objects ({0})".format(len(titles))
        return [site.pages[title] for title in titles]
    return titles
|
|
|
|
def strip_namespace(x):
    """Return *x* without its leading ``prefix:`` part.

    Everything up to and including the first colon is removed; a string
    that contains no colon is returned unchanged.
    """
    _prefix, colon, remainder = x.partition(":")
    return remainder if colon else x
|
|
|
|
# Fetch every category, order them by accent-stripped name, and reduce each
# one to a plain dict holding its 'title' and 'name'.
cats = list(site.allcategories())
cats.sort(key=lambda x: unidecode.unidecode(x.name))
# cats_by_name = {}
cats = [{'title': cat.page_title, 'name': cat.name} for cat in cats]

# Look-up table from 'title' to the category record.
# NOTE(review): the look-ups below use namespace-stripped API titles, so this
# assumes page_title carries no namespace prefix -- confirm against mwclient.
index = {c['title']: c for c in cats}

# Give every category a reference to its parent record.
for c in cats:
    for subcat_title in category_subcats(site, c['name'], objects=False):
        subcat = index.get(strip_namespace(subcat_title))
        if subcat is None:
            # The API reported a subcategory that allcategories() did not
            # return; it has no record to attach, so skip it instead of
            # aborting the whole run with a KeyError.
            continue
        # A category listed under several parents keeps the first one seen.
        subcat.setdefault('parent', c)
|
|
|
|
def make_hierarchy(items):
    """Link *items* into a tree and return its synthetic root.

    Each item is a dict. An item with a 'parent' key (holding the parent
    dict) is appended to that parent's 'children' list, which is created on
    demand, and its 'parent' value is then replaced by the parent's 'title'.
    Items without a 'parent' key become children of the returned root.

    The item dicts are modified in place.

    Returns:
        A dict ``{'children': [...]}`` holding the parentless items in
        input order.
    """
    top_level = []
    for entry in items:
        if 'parent' not in entry:
            top_level.append(entry)
            continue
        owner = entry['parent']
        owner.setdefault('children', []).append(entry)
        # Keep only the parent's title so the structure has no back-references.
        entry['parent'] = owner['title']
        # del entry['parent']
    return {'children': top_level}
|
|
|
|
def flatten(root, depth=0):
    """Yield one record per node below *root*, parents before children.

    Walks ``root['children']`` recursively. Every record is a new dict with
    the node's 'title' and 'name' plus its nesting 'depth' (*depth* for the
    direct children of *root*, one more per level below). The node's
    'parent' value is copied when the node has one.
    """
    for node in root['children']:
        row = dict(title=node['title'], name=node['name'], depth=depth)
        if 'parent' in node:
            row['parent'] = node['parent']
        yield row
        if 'children' in node:
            yield from flatten(node, depth + 1)
|
|
|
|
import json

# Link the category records into a tree below a synthetic root.
root = make_hierarchy(cats)
#print (json.dumps(root, indent=2), file=args.output)

# Walk the tree into a depth-annotated list and write it out as JSON.
flat = [*flatten(root)]
print(json.dumps(flat, indent=2), file=args.output)
|
|
|
|
|
|
|