/*

     word2html, a simple word to html converter

     Version M1
     For OS/2 and UNIX


     Available at the word2x for OS/2 homepage
     http://www.d3.net/joerg/word2x.html


     Performance: About 10x slower than word2x/EX for OS/2
     
     modeled after ideas in catdoc, word2x, mswordview and laola.
     Can work with word doc version 6,7(untested), 8


     Michael Ritzert (mjr@turbo.su.shuttle.de), July 1998

     Modified by Joerg Klemenz <joerg@gmx.de> September 2, 1998


*/

#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <netinet/in.h> /* used to fiddle with the byte ordering 
			   in a machine independent way */


/* HTML emitted before the converted text.  The single %s is filled in
   with the document title by the printf() call that prints it. */
static char headText [] =
  "<html>\n"
  "<head>\n"
  "<title>%s</title>\n"
  "</head>\n"
  "<body bgcolor=\"#ffffff\">\n" ;

/* HTML emitted after the converted text. */
static char footText [] =
  "\n"
  "</body>\n"
  "</html>\n" ;

/* Indirection through pointers: the output code prints via these
   rather than naming the arrays directly. */
char * head_ptr = headText ;
char * foot_ptr = footText ;

/* A contiguous run of bytes inside the input buffer that holds the
   document text.  find_text_part() sets start to NULL when no text
   part could be located. */
struct s_text {
  unsigned char * start;   /* first byte of the text, or NULL if not found */
  unsigned long len;       /* number of bytes of text beginning at start */
} ;


/* Decode a 32-bit little-endian ("VAX"/Intel order) unsigned value.

   i       points at the 4 bytes to decode (least significant byte first).
           Declared as const void * so that both char and unsigned char
           buffers can be passed without a cast or a pointer-sign mismatch.
   returns the decoded value, 0 .. 0xFFFFFFFF.

   The value is assembled byte by byte instead of being read through a
   casted pointer, so the result does not depend on the byte order (or the
   alignment requirements) of the machine we are running on.  This is not
   the fastest possible way, but the routine is called very seldom. */
unsigned long vax_long( const void *i ) {

  /* Always read the bytes as unsigned: a plain char may be signed and
     would sign-extend values >= 0x80 (seen on solaris2/sparc/gcc). */
  const unsigned char *b = (const unsigned char *) i;

  /* Accumulate in unsigned long, which is guaranteed to be at least
     32 bits wide, so the 16 and 24 bit shifts are always well defined. */
  return   (unsigned long) b[0]
       | ( (unsigned long) b[1] <<  8 )
       | ( (unsigned long) b[2] << 16 )
       | ( (unsigned long) b[3] << 24 );

}

/* Locate the entry named "WordDocument" in the input buffer.

   The name is searched for as 12 two-byte characters, each being the
   ASCII letter followed by a zero byte (W.o.r.d.D.o.c.u.m.e.n.t.).

   s       the complete input file in memory
   len     number of valid bytes at s
   returns the 32-bit little-endian value stored 0x74 bytes after the
           start of the name (converted to int), or -1 if the name does
           not occur.

   Only positions where the whole span up to and including that value
   (0x78 bytes) lies inside the buffer are considered, so neither the
   comparison nor the read can run past s[len-1].  This also covers
   buffers shorter than the name itself, where an unsigned "len - 24"
   would wrap around to a huge bound. */
int laola_find_word_doc( unsigned char * s, unsigned int len )
{
  static const unsigned char name[] = "WordDocument";
  enum {
    NAME_CHARS   = 12,      /* characters in name, each stored in 2 bytes */
    VALUE_OFFSET = 0x74,    /* distance from the name to the returned value */
    ENTRY_SPAN   = 0x78     /* VALUE_OFFSET plus the 4 bytes of the value */
  };
  unsigned int i, j;

  if( s == NULL || len < ENTRY_SPAN )
    return -1;

  for( i = 0; i <= len - ENTRY_SPAN; i++ ) {
    /* compare letter by letter; every letter must be followed by 0 */
    for( j = 0;
         j < NAME_CHARS && s[i + 2*j] == name[j] && s[i + 2*j + 1] == 0;
         j++ )
      continue;

    if( j == NAME_CHARS )
      return (int) vax_long( (char *) (s + i + VALUE_OFFSET) );
  }
  return -1;
}


/* Find the document text inside a Word file held in memory.

   in      the complete input file
   len     number of valid bytes at in
   returns a struct s_text describing the text.  On failure start is NULL
           and len is 0.

   The value returned by laola_find_word_doc() is taken as a block number:
   it is multiplied by 512 and a further 512 bytes are skipped to reach
   the start of the Word data (presumably the 512 bytes are the container's
   header block -- confirm against the file format description).  The
   32-bit values at offsets 0x18 and 0x1c of the Word data give the begin
   and the end of the text, relative to the start of the Word data.

   All three numbers come from the file and are therefore not trusted:
   a block that lies outside the buffer, or a begin offset behind the end
   of the data, yields "not found"; an end offset that points behind the
   buffer is cut back to the end of the buffer. */
struct s_text find_text_part( unsigned char * in, unsigned int len ) {
  struct s_text result;
  int block;
  unsigned long data_off;    /* offset of the Word data inside in */
  unsigned long avail;       /* bytes available from data_off to the end */
  unsigned long text_begin, text_end;

  result.start = NULL;
  result.len = 0;

  block = laola_find_word_doc( in, len );
  if( block < 0 )            /* -1: not found; other negatives: absurd block */
    return result;

  /* (block + 1) * 512 must not exceed len.  Testing against len / 512
     first keeps the multiplication from overflowing. */
  if( (unsigned long) block >= len / 512UL )
    return result;
  data_off = ( (unsigned long) block + 1 ) * 512UL;
  avail = len - data_off;

  if( avail < 0x20 )         /* need the two 32-bit fields at 0x18 and 0x1c */
    return result;

  text_begin = vax_long( (char *) (in + data_off + 0x18) );
  text_end   = vax_long( (char *) (in + data_off + 0x1c) );

  if( text_end > avail )
    text_end = avail;
  if( text_begin > text_end )
    return result;

  result.start = in + data_off + text_begin;
  result.len = text_end - text_begin;
  return result;
}

/* Write one byte of document text to stdout as HTML.

   Bytes with a special meaning in HTML are escaped, a number of 8-bit
   characters (the values look like Windows code page 1252 -- not
   verified here) are replaced by entities or ASCII look-alikes, and
   remaining control characters are dropped.  Nothing is appended to the
   inline replacements: a newline after e.g. "&nbsp;" would show up as
   unwanted white space in the middle of a word. */
static void put_html_char( unsigned char c )
{
  switch( c ) {

  case 'G'-64:
    fputs( "&#09;", stdout );
    break;

  case 'L'-64: /* form feed */
    fputs( "\n<br><hr><p>\n\n", stdout );
    break;

  case 0xC4:
    fputs( "&Auml;", stdout );
    break;
  case 0xD6:
    fputs( "&Ouml;", stdout );
    break;
  case 0xDC:
    fputs( "&Uuml;", stdout );
    break;
  case 0xDF:
    fputs( "&szlig;", stdout );
    break;
  case 0xE4:
    fputs( "&auml;", stdout );
    break;
  case 0xF6:
    fputs( "&ouml;", stdout );
    break;
  case 0xFC:
    fputs( "&uuml;", stdout );
    break;

  case 0x1e:
    putc( '-', stdout );
    break;
  case 0x1f:
    fputs( "&shy;", stdout );
    break;
  case 0x85:
    fputs( "...", stdout );
    break;
  case 0x91:
    putc( '`', stdout );
    break;
  case 0x92:
    putc( '\'', stdout );
    break;
  case 0x84:
    fputs( "``", stdout );
    break;
  case 0x93:
    fputs( "''", stdout );
    break;
  case 0x95: /* bullet */
    fputs( "<li>", stdout );
    break;   /* used to fall through and print an extra "-" */
  case 0x96:
  case 0x97:
    putc( '-', stdout );
    break;
  case 0x99:
    fputs( "&reg;", stdout );
    break;
  case 0x9a:
    putc( 's', stdout );
    break;
  case 0xa0:
    fputs( "&nbsp;", stdout );
    break;
  case 0xa9:
    fputs( "&copy;", stdout );
    break;
  case 0xae:
    fputs( "&reg;", stdout );
    break;
  case 0xab:
    fputs( "&laquo;", stdout );
    break;
  case 0xbb:
    fputs( "&raquo;", stdout );
    break;
  case '<':
    fputs( "&lt;", stdout );
    break;
  case '>':
    fputs( "&gt;", stdout );
    break;
  case '&':
    fputs( "&amp;", stdout );
    break;
  case 255:
    break;
  default:
    if( c >= ' ' )
      putc( c, stdout );
    break;
  }
}

/* Write one byte of a link target inside a double-quoted HTML attribute.
   The quote itself and the HTML meta characters are escaped so that the
   document cannot close the attribute or the tag early; control
   characters are dropped. */
static void put_attr_char( unsigned char c )
{
  switch( c ) {
  case '"':
    fputs( "&quot;", stdout );
    break;
  case '<':
    fputs( "&lt;", stdout );
    break;
  case '>':
    fputs( "&gt;", stdout );
    break;
  case '&':
    fputs( "&amp;", stdout );
    break;
  default:
    if( c >= ' ' )
      putc( c, stdout );
    break;
  }
}

/* Process the special tag whose start mark (^S) is at txt[pos].

   A tag runs up to the next ^U.  A tag that starts with " HYPERLINK "
   is written as <a href="...">...</a>: the link target follows the
   keyword and ends at a ^A (or, if there is none, at the ^T), with
   trailing blanks removed; the link text is everything between the ^T
   and the closing ^U.  Every other tag is skipped completely.

   txt, len  the document text
   pos       index of the ^S, pos < len
   returns   the index of the last byte consumed, so that the caller's
             loop increment continues with the byte after the tag.  If
             the tag is not closed inside the text only the start mark
             itself is consumed.

   All searches are limited to the text (memchr instead of strchr), as
   the text is binary data and not a NUL terminated string. */
static unsigned long put_field( const unsigned char * txt, unsigned long pos,
                                unsigned long len )
{
  static const char keyword[] = " HYPERLINK ";
  const unsigned long keyword_len = sizeof keyword - 1;
  const unsigned char * mark;
  unsigned long end;         /* index of the closing ^U */
  unsigned long sep;         /* index of the ^T in front of the link text */
  unsigned long url_start, url_end, k;

  mark = memchr( txt + pos, 'U'-64, len - pos );
  if( mark == NULL )
    return pos;
  end = (unsigned long) ( mark - txt );

  if( end - pos <= keyword_len
      || memcmp( txt + pos + 1, keyword, keyword_len ) != 0 )
    return end;                          /* not a hyperlink: skip the tag */

  url_start = pos + 1 + keyword_len;

  mark = memchr( txt + url_start, 'T'-64, end - url_start );
  if( mark == NULL )
    return end;                          /* no link text: skip the tag */
  sep = (unsigned long) ( mark - txt );

  url_end = sep;
  mark = memchr( txt + url_start, 'A'-64, sep - url_start );
  if( mark != NULL )
    url_end = (unsigned long) ( mark - txt );
  while( url_end > url_start && txt[url_end - 1] == ' ' )
    url_end--;

  fputs( "<a href=\"", stdout );
  for( k = url_start; k < url_end; k++ )
    put_attr_char( txt[k] );
  fputs( "\">", stdout );
  for( k = sep + 1; k < end; k++ )
    put_html_char( txt[k] );
  fputs( "</a>", stdout );

  return end;
}

/* word2html <inputfile>

   Reads the Word file named by argv[1] into memory, locates its text
   with find_text_part() and writes it to stdout as HTML, framed by the
   texts head_ptr and foot_ptr point to.

   Exit status: 0 on success, 1 if the file cannot be opened or read,
   2 if it is not recognized as a Word document, -1 if no file name
   was given. */
int main(int argc, char ** argv)
{

  int fd;
  struct stat sbuf;
  unsigned char * buffer_ptr ;
  const unsigned char * txt ;
  size_t size, got ;
  unsigned long pos ;
  struct s_text text;

  if( argc < 2 ) {
    fprintf( stderr, "Usage: %s [inputfile]\n  output goes to stdout\n\n",
             argc > 0 ? argv[0] : "word2html" ) ;
    exit (-1) ;
  }

  if( (fd = open( argv[1], O_RDONLY)) < 0 ) {
    fprintf( stderr, "ERROR: Can't open '%s' %s\n", argv[1] , strerror(errno) ) ;
    exit (1) ;
  }

  if( fstat( fd, &sbuf ) == -1 ) {
    fprintf( stderr, "ERROR: Can't fstat '%s' %s\n", argv[1], strerror(errno) ) ;
    close (fd) ;
    exit(1);
  }

  /* find_text_part() takes the length as unsigned int, so refuse files
     whose size does not survive the conversion (this also catches a
     negative size). */
  if( (off_t) (unsigned int) sbuf.st_size != sbuf.st_size ) {
    fprintf( stderr, "ERROR: '%s' is too large to convert\n", argv[1] ) ;
    close (fd) ;
    exit (1) ;
  }
  size = (size_t) sbuf.st_size ;

  /* malloc(0) may legitimately return NULL, so always ask for a byte */
  buffer_ptr = malloc( size ? size : 1 ) ;
  if( buffer_ptr == NULL ) {
    fprintf( stderr, "ERROR: Out of memory reading '%s'\n", argv[1] ) ;
    close (fd) ;
    exit (1) ;
  }

  /* read() may deliver less than requested; keep going until the whole
     file is in memory or end of file is reached. */
  got = 0 ;
  while( got < size ) {
    ssize_t n = read( fd, buffer_ptr + got, size - got ) ;
    if( n == -1 ) {
      if( errno == EINTR )
        continue ;
      fprintf( stderr, "ERROR: Read failed %s \n", argv[1] ) ;
      close (fd) ;
      free (buffer_ptr) ;
      exit (1) ;
    }
    if( n == 0 )               /* file is shorter than fstat() reported */
      break ;
    got += (size_t) n ;
  }
  close (fd) ;

  /* only the bytes actually read are handed on */
  if( got == 0 ) {
    text.start = NULL ;
    text.len = 0 ;
  }
  else
    text = find_text_part( buffer_ptr, (unsigned int) got ) ;

  if( text.start == NULL ) {
    fprintf( stderr, "ERROR: Cannot recognize %s as a MS-Word 6+ document.\n",
             argv[1] ) ;
    free (buffer_ptr) ;
    exit (2) ;
  }

  setbuf( stdout, NULL ) ;

  printf( head_ptr, argv[1] ) ;

  txt = text.start ;

  for( pos = 0; pos < text.len; pos++ ) {
    unsigned char c = txt[pos] ;

    if( c == '\r' ) {
      /* special treatment: single \r: <br>, double \r: <p>, \r followed
         by a zero byte: end of text.  At the very last byte there is no
         successor to look at; it is treated as a single \r. */
      int next = ( pos + 1 < text.len ) ? txt[pos + 1] : -1 ;

      if( next == '\r' ) {
        pos++ ;
        fputs( "</p>\n\n<p>\n", stdout ) ;
      }
      else if( next == 0 )
        break ;
      else
        fputs( "\n<br>\n", stdout ) ;
    }
    else if( c == 'S'-64 )     /* special tag */
      pos = put_field( txt, pos, text.len ) ;
    else
      put_html_char( c ) ;
  }

  /* not a format string, so do not hand it to printf() */
  fputs( foot_ptr, stdout ) ;
  free (buffer_ptr) ;
  exit (0) ;
}
