//(c) 2007 by Matthias Larisch
//released under GPL
//
//Converter for wikipedia dumps to mediawikiviewer file format
//creates .wwa with gz compressed articles
//.wwr with redirect list
//.wwt with article title index
//btcreate from freqmod is required to generate .wwi file!
//this software is a rewrite of freqmods ruby converter.
//the idea and some regular expressions are taken from him!


#include <stdio.h>
#include <stdlib.h>
#include <pcre.h>
//#include <time.h>
//#include <stdint.h>
//#include <limits.h>
#include <string.h>
#include <zlib.h>
#include "xmlconv.h"
#include <fcntl.h>


// Shared converter state. Everything below is set up in main() and used by
// the helper functions in this file.

// Title index output (main() opens "<prefix>.wwt"); do_article() appends one
// title_entry followed by the raw title bytes for every stored article.
FILE *titlefile;
// Redirect list output (main() opens "<prefix>.wwr"); written by insert_redirect().
FILE *redirectfile;
// Article archive currently being written; (re)opened by open_outfile().
FILE *outfile;
// Number embedded in the current archive name; main() sets it to -1 so that
// the first open_outfile(1) call creates file number 0.
int outfilenum;
// Heap copy of the output prefix given on the command line.
char *outfilename;
// Compiled redirect pattern; built in main(), matched in do_article().
pcre *regex_redirect;
// Scratch offset vector and match count for the redirect match in do_article().
int ovector_temp[30];
int matchcount_temp;
// Transfer buffer (main() allocates 512 bytes) used by do_article() to copy
// temp.gz into outfile, and the byte count returned by the last fread() into it.
char *copybuffer;
int copybufferlen;

// Record buffers filled by do_article() before being written out. Their types
// are not declared in this file (presumably in xmlconv.h).
article_header buf_article_header;
title_entry buf_title_entry;


// Reverses, in place, the characters in the inclusive range [begin, end].
// begin points at the first character and end at the last one (not at the
// terminator). Nothing happens when the range holds fewer than two characters.
void strreverse(char* begin, char* end) {
	for (; begin < end; ++begin, --end) {
		char swapped = *begin;
		*begin = *end;
		*end = swapped;
	}
}

// Writes the textual representation of value in the given base to str.
//
// value: number to convert; negative values get a leading '-'.
// str:   destination buffer supplied by the caller. It must be large enough
//        for all digits, an optional sign and the terminating NUL.
// base:  radix from 2 to 36; digits above 9 are written as lowercase letters.
//        For any other base an empty string is stored.
void itoa(int value, char* str, int base) {
	static const char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
	// Worst case is base 2: one character per bit (8-bit bytes assumed),
	// plus room for the sign and the terminating NUL.
	char scratch[sizeof(int) * 8 + 2];
	char *cursor = scratch + sizeof scratch - 1;
	unsigned int magnitude;
	unsigned int radix;

	// Validate base: the digit table covers 36 symbols.
	if (base < 2 || base > 36) { *str = '\0'; return; }
	radix = (unsigned int)base;

	// Work on the unsigned magnitude so that INT_MIN, whose negation does not
	// fit in an int, is handled without signed overflow.
	magnitude = (value < 0) ? 0u - (unsigned int)value : (unsigned int)value;

	// Fill the scratch buffer from the back so the digits end up in order.
	*cursor = '\0';
	do {
		*--cursor = digits[magnitude % radix];
		magnitude /= radix;
	} while (magnitude != 0);
	if (value < 0) *--cursor = '-';

	// Copy the finished text including its terminator.
	memcpy(str, cursor, (size_t)(scratch + sizeof scratch - cursor));
}

// Appends one redirect record to the global redirectfile.
//
// Record layout, in this order:
//   unsigned int  length of the source title
//   unsigned int  length of the target title
//   source title bytes (no terminator)
//   target title bytes (no terminator)
//
// redirect_from: NUL-terminated title of the redirecting page.
// redirect_to:   NUL-terminated title of the page it points to.
void insert_redirect(char redirect_from[255], char redirect_to[255])
{
	unsigned int from_length=strlen(redirect_from);
	unsigned int to_length=strlen(redirect_to);

	// Both lengths come first, followed by the two strings back to back.
	fwrite(&from_length, sizeof from_length, 1, redirectfile);
	fwrite(&to_length, sizeof to_length, 1, redirectfile);
	fwrite(redirect_from, sizeof(char), from_length, redirectfile);
	fwrite(redirect_to, sizeof(char), to_length, redirectfile);
}

// Returns non-zero when the text at s starts with the literal prefix.
// strncmp() stops comparing at the first NUL in s, so a prefix test close to
// the end of the article never reads past the string terminator.
static int article_has_prefix(const char *s, const char *prefix)
{
	return strncmp(s, prefix, strlen(prefix))==0;
}

// Converts wiki markup in article into the viewer's markup bytes.
//
// Handled constructs:
//   [[Target|Name]]  -> MARKUP_STARTLINK Target MARKUP_MODE Name MARKUP_ENDLINK
//   '''''Text'''''   -> MARKUP_STARTBOLD MARKUP_STARTITALIC Text MARKUP_ENDITALIC MARKUP_ENDBOLD
//   '''Text'''       -> MARKUP_STARTBOLD Text MARKUP_ENDBOLD
//   ''Text''         -> MARKUP_STARTITALIC Text MARKUP_ENDITALIC
//   <!-- ... --> and &lt;!-- ... --&gt;  -> dropped
//   &lt; / &gt;      -> '<' / '>'
// Every other byte is copied unchanged.
//
// new_article:    output buffer. No rule emits more bytes than it consumes,
//                 so article_length+1 bytes are sufficient.
// article:        input text; must be NUL-terminated at article[article_length].
// article_length: length of the input without the terminator.
//
// Returns the number of bytes stored in new_article. The loop also processes
// the terminator at article[article_length], so unless the text ends inside
// an unterminated comment the count includes one trailing NUL byte.
//
// NOTE(review): link and bold/italic markup is recognised before the comment
// state is checked, so such markup inside an HTML comment still produces
// markup bytes. This ordering is kept from the original implementation.
int parse_article(char *new_article, char *article, int article_length)
{
	int art_pos=0;			//Read position in article
	int new_art_pos=0;		//Write position in new_article

	int in_link=0;
	int in_link_name=0;
	int in_bold=0;
	int in_italic=0;
	int in_htmlcomment=0;

	while(article_length>=art_pos)			//The last iteration handles the terminating zero
	{
		const char *cur=article+art_pos;

		if(article_length>art_pos && *cur==0) printf("Read zero but end of string not reached, %d", art_pos);

		//Link start: exactly two opening brackets
		if(article_has_prefix(cur, "[[") && cur[2]!='[')
		{
			in_link=1;
			in_link_name=0;
			new_article[new_art_pos++]=MARKUP_STARTLINK;
			art_pos+=2;
			continue;
		}
		if(in_link==1)
		{
			//Link end: exactly two closing brackets
			if(article_has_prefix(cur, "]]") && cur[2]!=']')
			{
				in_link=0;
				in_link_name=0;
				new_article[new_art_pos++]=MARKUP_ENDLINK;
				art_pos+=2;
				continue;
			}
			//Only the first '|' separates the target from the displayed name
			if(*cur=='|' && in_link_name==0)
			{
				in_link_name=1;
				new_article[new_art_pos++]=MARKUP_MODE;
				art_pos++;
				continue;
			}
		}

		//Bold and italic together. The longest quote run must be tested first.
		if(article_has_prefix(cur, "'''''"))
		{
			//Open both styles only when neither is active. The previous test
			//"in_bold==in_italic==0" was evaluated as "(in_bold==in_italic)==0"
			//and therefore emitted END markers for an opening run.
			if(in_bold==0 && in_italic==0)
			{
				in_bold=1;
				in_italic=1;
				new_article[new_art_pos++]=MARKUP_STARTBOLD;
				new_article[new_art_pos++]=MARKUP_STARTITALIC;
			}
			else
			{
				in_bold=0;
				in_italic=0;
				new_article[new_art_pos++]=MARKUP_ENDITALIC;
				new_article[new_art_pos++]=MARKUP_ENDBOLD;
			}
			art_pos+=5;
			continue;
		}

		//Bold toggle
		if(article_has_prefix(cur, "'''"))
		{
			new_article[new_art_pos++]=(in_bold==0) ? MARKUP_STARTBOLD : MARKUP_ENDBOLD;
			in_bold=(in_bold==0);
			art_pos+=3;
			continue;
		}

		//Italic toggle
		if(article_has_prefix(cur, "''"))
		{
			new_article[new_art_pos++]=(in_italic==0) ? MARKUP_STARTITALIC : MARKUP_ENDITALIC;
			in_italic=(in_italic==0);
			art_pos+=2;
			continue;
		}

		//HTML comments, literal and entity-escaped: <!-- -->  /  &lt;!-- --&gt;
		if(article_has_prefix(cur, "<!--"))
		{
			in_htmlcomment=1;
			art_pos+=4;
			continue;
		}
		if(article_has_prefix(cur, "&lt;!--"))
		{
			in_htmlcomment=1;
			art_pos+=7;
			continue;
		}
		if(in_htmlcomment==1)
		{
			if(article_has_prefix(cur, "-->"))
			{
				in_htmlcomment=0;
				art_pos+=3;
			}
			else if(article_has_prefix(cur, "--&gt;"))
			{
				in_htmlcomment=0;
				art_pos+=6;
			}
			else
			{
				art_pos++;			//Comment content is skipped
			}
			continue;
		}

		//Remaining escaped angle brackets
		if(article_has_prefix(cur, "&lt;"))
		{
			art_pos+=4;
			new_article[new_art_pos++]='<';
			continue;
		}
		if(article_has_prefix(cur, "&gt;"))
		{
			art_pos+=4;
			new_article[new_art_pos++]='>';
			continue;
		}

		//Plain byte
		new_article[new_art_pos++]=*cur;
		art_pos++;
	}
	return new_art_pos;			//Number of bytes written to new_article
}

void do_article(int id, char title[255], char *article, int article_length)
{
	//printf("in doarticle id %d\ntitle %s\ntext %s",id,title,article);
	//printf("articlelength: %d",strlen(article));
	if(article_length<1000)					//Redirect processing, redirects are short in general... speedup :)
	{
		matchcount_temp=pcre_exec(regex_redirect, NULL, article, article_length, 0, PCRE_NOTEMPTY, ovector_temp, 30);
		if(matchcount_temp>1)
		{
			//printf("REDIRECT %s",article);
			char redirect_to[255];
			pcre_copy_substring(article, ovector_temp, matchcount_temp, 1, redirect_to, 255);
			if(strlen(redirect_to)<1) printf("Could not process redirect %s\n",title);
			else
			{
				insert_redirect(title, redirect_to);
				return;
			}
		}
	}
	char *article_parsed=(char *)malloc(article_length*sizeof(char)+1);
	int article_parsed_length=0;
	article_parsed_length=parse_article(article_parsed, article, article_length);							//This function processes the stylistics in the article. We give the
													//pointer as parameter so it changes our article in same memory
	//printf("%s",*article);
	//At this point we should have the correct article in article-string.
	//This needs to be compressed using gzip and then written to the outfile:
	buf_article_header.id=id;
	buf_article_header.article_length=article_parsed_length;
	buf_article_header.title_length=strlen(title);
	if((int)ftell(outfile)+buf_article_header.article_length > 0x40000000)
	{
		//Begin new output file at 1gb
		fclose(outfile);
		open_outfile(1);
	}

	//else open_outfile(0);
	buf_title_entry.filenumber=outfilenum;
	buf_title_entry.fileposition=(int)ftello(outfile);
	buf_title_entry.title_length=buf_article_header.title_length;
	fwrite (&buf_title_entry, sizeof(buf_title_entry), 1, titlefile);
	fwrite (title, sizeof(char), buf_title_entry.title_length, titlefile);
	//printf("diverse pointer: %p %p %p %p %p\n", article, *article, **article, buf_title_entry, &buf_title_entry);

	gzFile *gzipfile;
	FILE *tempfile;
	if((gzipfile=gzopen("temp.gz","w9"))==NULL) perror("gzipfile-open");

	gzwrite(gzipfile, &buf_article_header, sizeof(buf_article_header));
	gzwrite(gzipfile, title, buf_title_entry.title_length);
	gzwrite(gzipfile, article_parsed, article_parsed_length);
	//gzflush(gzipfile, Z_FINISH);
	//gzputs(gzipfile,*article);
	gzclose(gzipfile);
	if((tempfile=fopen("temp.gz","rb"))==NULL)
	{
		printf("Couldn't open temp.gz File, skipping...\n");
		return;
	}

	while(!feof(tempfile))
	{
		copybufferlen=fread(copybuffer, 1, 512, tempfile);
		fwrite(copybuffer, 1, copybufferlen, outfile);
	}
	fclose(tempfile);
	free(article_parsed);										//This is the new article

/*
tempfile=fopen("temp.art","a");
fputs("neuer artikel:\n",tempfile);
fwrite(*article, 1, article_length, tempfile);
fclose(tempfile);
*/

}

int open_outfile(int mode)
{
	char *filename=(char *)malloc(sizeof(char)*(strlen(outfilename)+7));
	strcpy(filename,outfilename);
	if(mode==1)	outfilenum++;
	itoa(outfilenum,filename+strlen(filename),10);
	strcat(filename,".wwa");
	//printf("opening %s\n",filename);
	if(mode==1)
	{
		if((outfile=fopen(filename,"w"))==NULL)
		{
			printf("Error on creating article output file\n");
			return 0;
		}
	}
	else
	{
		if((outfile=fopen(filename,"a"))==NULL)
		{
			printf("Error on reopening article output file\n");
			return 0;
		}
	}
	return 1;
	free(filename);
}


int main(int argc,char * argv[])
{
	printf("Wikipedia XML Converter 0.2\n");
 	if(argc < 3)
 	{
		printf("Help:\nThis program converts wikipedia-xml-dump to rockbox mww format.\n");
		printf("Usage: xmlconv <input-xml-dump> <output-prefix>\n");
		printf("example: xmlconv dewiki.xml meindewiki\n");
		return 0;
	}
	FILE *xmldump;
	printf("working on %s with output prefix %s\n",argv[1], argv[2]);
	outfilename=(char *)malloc((strlen(argv[2])+1)*sizeof(char));
	strcpy(outfilename,argv[2]);
	if (argv[1][0]=='-' && argv[1][1]=='\0') {
	    xmldump = stdin;
	} else if((xmldump=fopen(argv[1],"rb"))==NULL)
	{
		printf("Error on opening input file\n");
		return 1;
	}

	if((titlefile=fopen(strcat(outfilename,".wwt"),"wb"))==NULL)
	{
		printf("Error on opening title output file\n");
		return 1;
	}
	strcpy(outfilename,argv[2]);
	if((redirectfile=fopen(strcat(outfilename,".wwr"),"wb"))==NULL)
	{
		printf("Error on opening redirect output file\n");
		return 1;
	}
	strcpy(outfilename,argv[2]);
	outfilenum=-1;
	if(!open_outfile(1)) return 1;

	copybuffer=(char *)malloc(512);
	long long int dumplen=0, actlen=0;
	if (xmldump != stdin) {
	    fseeko(xmldump,0,SEEK_END);
	    dumplen=ftello(xmldump);
	    fseeko(xmldump,0,SEEK_SET);
	} else {
	    dumplen=-1;
	}
	char zeilenbuffer[S_ROWBUF];
	int i=0;
	const char *error=NULL;
	int erroffset;
	int ovector_open[30], ovector_end[30];
	int matches_open, matches_closed;
	pcre *regex_tagname, *regex_endtag, *regex_endtag_text, *regex_betweentags;
	regex_tagname=pcre_compile("<([a-zA-Z0-9]+)[^>/]*(/)?>", 0, &error, &erroffset, NULL);
	regex_endtag=pcre_compile("</([a-zA-Z0-9]+)[^>]*>", 0, &error, &erroffset, NULL);
	regex_endtag_text=pcre_compile("([^<]*)</[a-zA-Z0-9]+[^>]*>", 0, &error, &erroffset, NULL);
	regex_betweentags=pcre_compile("<([a-zA-Z0-9]+)[^>]*>([^<]*)(</\\1[^>]*>)?", 0, &error, &erroffset, NULL);
	regex_redirect=pcre_compile("(?i)#REDIRECT:? ?\\[\\[([^\\]]*)]]", 0, &error, &erroffset, NULL);
	char tagbuf_o[30];
	char tagbuf_c[30];
	char title[255];
	int id;
	char *article=(char *)malloc(S_ARTBUF*sizeof(char));			//Article Buffer... Hope it's enough
	char buffer[255];
	char *buffer2;
	int k=0;
	int article_length=0; //Optimisation: strlen() is VERY slow on large strings. saves about 70% time
	int mode=TAG_OUTSIDE;
	while (fgets(zeilenbuffer,8191,xmldump)!=0)
	{
		i++;
		if(i%500000==0 && dumplen > 0)
		{
			actlen=ftello(xmldump);
			printf("progress: %Lg percent\n", ((long double)actlen/(long double)dumplen)*100);
			fflush(stdout);
		}
		//if(i>=20000000) break;
		zeilenbuffer[strlen(zeilenbuffer)-1]=0;
		matches_open=pcre_exec(regex_tagname, NULL, zeilenbuffer, strlen(zeilenbuffer), 0, PCRE_NOTEMPTY, ovector_open, 30);
		matches_closed=pcre_exec(regex_endtag, NULL, zeilenbuffer, strlen(zeilenbuffer), 0, PCRE_NOTEMPTY, ovector_end, 30);
		//matches_closed=0;
		//printf("\nread line %s, %d, %d",zeilenbuffer, matches_open, matches_closed);
		if(matches_open>=2)
		{
			pcre_copy_substring(zeilenbuffer, ovector_open, matches_open, 1, tagbuf_o, 30);
			//printf("open: %s",tagbuf_o);
			if(strcasecmp(tagbuf_o, "page")==0)
			{
				//printf("FOUND A PAGE BEGIN\n");
				if(mode!=TAG_OUTSIDE) printf("page begin but not outside before\n");
				mode=TAG_PAGE;
			}
			if(strcasecmp(tagbuf_o, "title")==0 && mode==TAG_PAGE)
			{
				if((matches_open=pcre_exec(regex_betweentags, NULL, zeilenbuffer, strlen(zeilenbuffer), 0, PCRE_NOTEMPTY, ovector_open, 30))<1) printf("title empty\n");
				else
				{
					pcre_copy_substring(zeilenbuffer, ovector_open, matches_open, 2, title, 255);
					//if(i%100==0)printf("title: %s\n", title);
				}
			}
			if(strcasecmp(tagbuf_o, "revision")==0)
			{
				if(mode!=TAG_PAGE) printf("revision but not in page");
				mode=TAG_REVISION;
			}

			if(strcasecmp(tagbuf_o, "id")==0 && mode==TAG_REVISION)
			{
				if((matches_open=pcre_exec(regex_betweentags, NULL, zeilenbuffer, strlen(zeilenbuffer), 0, PCRE_NOTEMPTY, ovector_open, 30))<1) printf("id empty on %s\n", title);
				else
				{
					pcre_copy_substring(zeilenbuffer, ovector_open, matches_open, 2, buffer, 255);
					id=atoi(buffer);
					//printf("ID %d\n", id);
				}
			}

			if(strcasecmp(tagbuf_o, "contributor")==0)
			{
				if(mode!=TAG_REVISION) printf("contributor but not in revision");
				mode=TAG_CONTRIBUTOR;
			}

			if(strcasecmp(tagbuf_o, "text")==0)
			{
				if(mode!=TAG_REVISION){
					printf("text but not in revision");
					if(mode!=TAG_PAGE) printf("and even not in page");
				}
				article[0]=0;							//clear article buffer
				article_length=0;						//MUST synchro..
				if(matches_open!=3) mode=TAG_TEXT;			//When there are 3 matches -> matched <text [...]/>
				else continue;								//When 3 matches -> no text so go on
				if((matches_open=pcre_exec(regex_betweentags, NULL, zeilenbuffer, strlen(zeilenbuffer), 0, 0, ovector_open, 30))<1) printf("text didnt match second time\n");
				else
				{
					pcre_copy_substring(zeilenbuffer, ovector_open, matches_open, 2, article, 8191);
					article_length=strlen(article);
					//printf("Artikel erste zeile: %s\n",article);
				}
				//printf("FOUND TEXT BEGIN");
			}

		}
		else if(mode==TAG_TEXT && matches_closed<2)
		{
			//printf("text going on\n");
			//strcat (article, "\n");			//optimisation: strcat also very slow
			//strcat (article, zeilenbuffer);
			*(article+article_length)='\n';
			strcpy(article+article_length+1,zeilenbuffer);
			article_length+=(strlen(zeilenbuffer)+1);
			if(article_length>=S_ARTBUF){
				printf("Article buffer overflow! Bad sourcefile or increase buffersize\n");
				return 0;
			}
		}


		if(matches_closed==2)
		{
			pcre_copy_substring(zeilenbuffer, ovector_end, matches_closed, 1, tagbuf_c, 30);
			//printf("end: %s",tagbuf_c);
			if(strcasecmp(tagbuf_c, "contributor")==0)
			{
				if(mode!=TAG_CONTRIBUTOR) printf("contributor end but not opened");
				mode=TAG_REVISION;
			}

			if(strcasecmp(tagbuf_c, "revision")==0)
			{
				if(mode!=TAG_REVISION)
				{
					printf("revision end but not opened");
				}
				mode=TAG_PAGE;
			}

			if(strcasecmp(tagbuf_c, "text")==0)
			{
				if(mode!=TAG_TEXT) printf("text end but not opened");
				mode=TAG_REVISION;
				if(matches_open<2)							//Only add this when it is not already added (text opened same line)
				{
					//buffer2=strstr(zeilenbuffer, "</text");
					//strncat(article, zeilenbuffer, strlen(zeilenbuffer)-strlen(buffer));
					//strcat (article, zeilenbuffer);
				}
				do_article(id, title, article, article_length);
				//printf("FOUND TEXT END");
			}

			if(strcasecmp(tagbuf_c, "page")==0)
			{
				if(mode!=TAG_PAGE) printf("page end but not opened");
				mode=TAG_OUTSIDE;
				//printf("FOUND PAGE END");
			}

		}


	}
	free(article);
	free(outfilename);
	free(copybuffer);
	fclose(titlefile);
	fclose(redirectfile);
	fclose(outfile);
	fclose(xmldump);

}
