"""Dump the category tree of a MediaWiki site as a flat JSON list.

Every category reported by the wiki is fetched, each one is attached to the
first category (in unaccented alphabetical order) that lists it as a
subcategory, and the resulting tree is written depth-first as a JSON array of
``{"title", "name", "depth"[, "parent"]}`` objects.
"""
import argparse
import json
import sys

import unidecode
from mwclient import Site, APIError
from mwclient.page import Page


def category_subcats(site, cattitle, objects=True):
    """Return the subcategories of the category *cattitle*.

    Queries ``list=categorymembers`` with ``cmtype=subcat`` through
    ``site.api`` and follows ``cmcontinue`` until the response carries no
    further continuation.

    :param site: object exposing ``api(action, **params)`` and ``pages[...]``
        (the script passes an mwclient ``Site``).
    :param cattitle: title passed as ``cmtitle``.
    :param objects: when true, return ``site.pages[title]`` for every title;
        otherwise return the raw title strings.
    :returns: list of titles or page objects, in API order.
    """
    params = {
        'list': "categorymembers",
        'cmtitle': cattitle,
        'cmtype': "subcat",
        'cmlimit': 50,
    }
    titles = []
    while True:
        resp = site.api("query", **params)
        titles.extend(x['title'] for x in resp['query']['categorymembers'])
        cont = resp.get('continue')
        if not cont:
            break
        # Resume the listing where the previous batch stopped.
        params['cmcontinue'] = cont['cmcontinue']
    if objects:
        return [site.pages[x] for x in titles]
    return titles


def strip_namespace(x):
    """Return *x* without its leading ``prefix:`` part, if it has one.

    Only the first colon is treated as the separator, so colons inside the
    remaining title are preserved.
    """
    if ":" in x:
        return x.split(":", 1)[1]
    return x


def make_hierarchy(items):
    """Link *items* into a tree and return its synthetic root.

    Each item is a dict. An item with a ``'parent'`` key (holding the parent
    dict) is appended to that parent's ``'children'`` list, after which its
    ``'parent'`` value is replaced in place by the parent's ``'title'``.
    Items without a ``'parent'`` key become children of the returned root.

    Items whose parent chain never reaches a parentless item (for example two
    categories that are each other's parent) are not reachable from the root.

    :param items: iterable of dicts; mutated in place.
    :returns: ``{'children': [...]}`` holding the top-level items.
    """
    root = {'children': []}
    for item in items:
        if 'parent' in item:
            parent = item['parent']
            parent.setdefault('children', []).append(item)
            # Keep only the title so the flat output stays JSON-serialisable
            # and does not nest the whole parent record.
            item['parent'] = parent['title']
        else:
            root['children'].append(item)
    return root


def flatten(root, depth=0):
    """Yield the descendants of *root* depth-first, parents before children.

    Each yielded dict carries ``'title'``, ``'name'`` and ``'depth'`` (0 for
    the direct children of *root*), plus ``'parent'`` when the node has one.
    """
    for x in root['children']:
        r = {'title': x['title'], 'name': x['name'], 'depth': depth}
        if 'parent' in x:
            r['parent'] = x['parent']
        yield r
        if 'children' in x:
            yield from flatten(x, depth + 1)


ap = argparse.ArgumentParser("")
ap.add_argument("--wikiprotocol", default="http")
ap.add_argument("--wikihost", default="localhost")
ap.add_argument("--wikipath", default="/mw/")
# NOTE: --limit is accepted but nothing below reads it.
ap.add_argument("--limit", default=None)
ap.add_argument("--output", default=sys.stdout, type=argparse.FileType('w'))
# ap.add_argument("--user", default=None)
# ap.add_argument("--password", default=None)
args = ap.parse_args()

site = Site((args.wikiprotocol, args.wikihost), path=args.wikipath)

# Sort on the unaccented name so accented titles are ordered by their
# unidecode transliteration instead of by raw code point.
cats = sorted(site.allcategories(), key=lambda x: unidecode.unidecode(x.name))
cats = [{'title': cat.page_title, 'name': cat.name} for cat in cats]
index = {c['title']: c for c in cats}

for c in cats:
    for member in category_subcats(site, c['name'], objects=False):
        subcat = index.get(strip_namespace(member))
        if subcat is None:
            # categorymembers may return a title that allcategories did not
            # list; skip it instead of aborting the whole export.
            print("skipping unknown subcategory: {0}".format(member),
                  file=sys.stderr)
            continue
        # The first category that claims a subcategory becomes its parent.
        if 'parent' not in subcat:
            subcat['parent'] = c

root = make_hierarchy(cats)
flat = list(flatten(root))
print(json.dumps(flat, indent=2), file=args.output)