bachelor-thesis/thesis/checkbiw/contrib/diction/sentence.c

/* Notes */ /*{{{C}}}*//*{{{*/
/*

This program is GNU software, copyright 1997, 1998, 1999, 2000, 2001
Michael Haardt <michael@moria.de>.

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2 of the License, or (at your
option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License along
with this program.  If not, write to the Free Software Foundation, Inc.,
59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

*/
/*}}}*/
/* #includes */ /*{{{*/
#undef  _POSIX_SOURCE
#define _POSIX_SOURCE   1
#undef  _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 2

#include "config.h"

#include <sys/types.h>
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <locale.h>
#ifdef HAVE_GETTEXT
#include <libintl.h>
#define _(String) gettext(String)
#else
#define _(String) String
#endif
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "misc.h"
#include "sentence.h"
/*}}}*/

/* German abbreviations, used when the language starts with "de"; NULL-terminated. */
static const char *abbreviations_de[]= /*{{{*/
{
  "Dr", "bzw", "etc", "sog", "usw",
  NULL /* end of list */
};
/*}}}*/
/* English abbreviations, used when the language starts with "en" or "C"; NULL-terminated. */
static const char *abbreviations_en[]= /*{{{*/
{
  "ch", "Ch", "ckts", "dB",
  "Dept", "dept", "Depts", "depts",
  "Dr", "Drs",
  "Eq", "eq",
  "etc", "et al",
  "Fig", "fig", "Figs", "figs",
  "ft",
  /* a digit followed by " in" (presumably inches) */
  "0 in", "1 in", "2 in", "3 in", "4 in",
  "5 in", "6 in", "7 in", "8 in", "9 in",
  "Inc", "Jr", "jr", "mi", "Mr", "Mrs", "Ms",
  "No", "no", "Nos", "nos",
  "Ph",
  "Ref", "ref", "Refs", "refs",
  "St", "vs", "yr",
  NULL /* end of list */
};
/*}}}*/
/* Empty list, used for every language without a table of its own. */
static const char *abbreviations_none[]= /*{{{*/
{
  NULL /* end of list */
};
/*}}}*/
/* Abbreviation list currently in effect; sentence() points it at one of the tables above. */
static const char **abbreviations=NULL;

/*
 * Tell whether the text s[0..length-1] ends in an abbreviation from the
 * currently selected abbreviation list.
 *
 * s       text to examine; only the first length characters are read,
 *         so it need not be NUL-terminated at that position
 * length  number of characters to consider, must be at least 1
 * lang    language name; not used here, the list is chosen through the
 *         file-level abbreviations pointer
 *
 * Returns 1 if the text ends in an abbreviation, 0 otherwise.  Text that
 * does not end in a letter, or that ends in a possessive 's, is never an
 * abbreviation.
 */
static int endingInAbbrev(const char *s, size_t length, const char *lang) /*{{{*/
{
  const char **abbrev=abbreviations;
  size_t aLength;

  (void)lang;
  /*
   * The <ctype.h> classifiers are only defined for unsigned char values
   * and EOF, so plain char must be converted before it is passed on;
   * otherwise bytes above 0x7f are undefined behaviour where char is signed.
   */
  if (!isalpha((unsigned char)s[length-1])) return 0;
  if (endingInPossesiveS(s,length)) return 0;
  while (*abbrev!=(const char*)0)
  {
    if ((aLength=strlen(*abbrev))<length)
    {
      /* a lone letter after a non-letter, e.g. an initial */
      if (!isalpha((unsigned char)s[length-2])) return 1;
      /* a listed abbreviation that is not the tail of a longer word */
      if (!isalpha((unsigned char)s[length-aLength-1]) && strncmp(s+length-aLength,*abbrev,aLength)==0) return 1;
    }
    else
    {
      /* the whole text is one letter */
      if (length==1) return 1;
      /* the whole text is exactly this abbreviation */
      if (aLength==length && strncmp(s,*abbrev,aLength)==0) return 1;
    }
    ++abbrev;
  }
  return 0;
}
/*}}}*/

/*
 * Report whether s[0..length-1] ends in a possessive 's.
 *
 * This is only ever true while the English abbreviation list is selected
 * and the text is at least three characters long.  Returns 1 or 0.
 */
int endingInPossesiveS(const char *s, size_t length) /*{{{*/
{
  if (abbreviations!=abbreviations_en) return 0;
  if (length<3) return 0;
  return (s[length-2]=='\'' && s[length-1]=='s');
}
/*}}}*/
/*
 * Read text from a stream, split it into sentences and pass each one to
 * a callback.
 *
 * cmd      program name, used as prefix of error messages
 * in       stream to read from
 * file     name reported to the callback until a "#line" directive in
 *          the input replaces it
 * process  called as process(text,length,file,line) for every sentence;
 *          text is NUL-terminated and line is the line the sentence began
 *          on.  It is also called with an empty text at the first blank
 *          line after text and possibly once at the end of input
 *          (apparently as a paragraph marker).
 * lang     language name; a prefix of "en" or "C" selects the English
 *          abbreviation list, "de" the German one, anything else none
 *
 * A sentence starts at an upper case letter and ends at '.', ':', '!' or
 * '?' followed by white space or end of input, unless the period belongs
 * to an abbreviation.  Runs of white space are collapsed to one blank.
 * Lines starting with '#' are treated as preprocessor style directives
 * and never become part of a sentence.
 *
 * Exits with status 2 if memory runs out or the internal regular
 * expression can not be compiled.
 *
 * NOTE(review): text still pending at end of input without a sentence
 * terminator is not passed to process().
 */
void sentence(const char *cmd, FILE *in, const char *file, void (*process)(const char *, size_t, const char *, int), const char *lang) /*{{{*/
{
  /* variables */ /*{{{*/
  int voc,oc,c;
  char *sent;
  size_t length=0,capacity=128;
  int inWhiteSpace=0;
  int inParagraph=0;
  int line=1,beginLine=1;
  int err;
  regex_t hashLine;
  char filebuf[_POSIX_PATH_MAX+1];
  /*}}}*/

  if (strncmp(lang,"en",2)==0) abbreviations=abbreviations_en;
  else if (strncmp(lang,"C",1)==0) abbreviations=abbreviations_en;
  else if (strncmp(lang,"de",2)==0) abbreviations=abbreviations_de;
  else abbreviations=abbreviations_none;
  /* allocate sentence buffer */ /*{{{*/
  if ((sent=malloc(capacity))==(char*)0)
  {
    fprintf(stderr,_("%s: allocating sentence buffer failed: %s\n"),cmd,strerror(errno));
    exit(2);
  }
  /*}}}*/
  /* compile #line number "file" regular expression */ /*{{{*/
  if ((err=regcomp(&hashLine,"^[ \t]*line[ \t]*\\([0-9][0-9]*\\)[ \t]*\"\\([^\"]*\\)\"",0)))
  {
    char buf[256];

    /*
     * regerror() truncates and NUL-terminates by itself; its return value
     * is the size needed for the full message and may exceed the buffer,
     * so it must not be used as an index.
     */
    (void)regerror(err,&hashLine,buf,sizeof(buf));
    fprintf(stderr,_("%s: internal error, compiling a regular expression failed (%s).\n"),cmd,buf);
    exit(2);
  }
  /*}}}*/
  /* voc, oc and c are the previous, the current and the next character */
  voc='\n';
  c=getc(in);
  while ((oc=c)!=EOF)
  {
    c=getc(in);
    if (oc=='\n') ++line;
    if (voc=='\n' && oc=='#') /* process cpp style #line, continue */ /*{{{*/
    {
      char buf[_POSIX_PATH_MAX+20];
      regmatch_t found[3];

      buf[0]=(c==EOF ? '\0' : c); buf[1]='\0';
      /* a bare "#" line is already complete, do not swallow the next line */
      if (c!=EOF && c!='\n') (void)fgets(buf+1,sizeof(buf)-1,in);
      if (regexec(&hashLine,buf,3,found,0)==0) /* #line */ /*{{{*/
      {
        size_t len;

        /* the newline that ended the directive is counted below */
        line=strtol(buf+found[1].rm_so,(char**)0,10)-1;
        len=found[2].rm_eo-found[2].rm_so;
        if (len>_POSIX_PATH_MAX) len=_POSIX_PATH_MAX;
        memcpy(filebuf,buf+found[2].rm_so,len);
        filebuf[len]='\0';
        file=filebuf;
      }
      /*}}}*/
      /* resume at the end of the directive; voc stays '\n' so that a following directive is recognised */
      c='\n';
      continue;
    }
    /*}}}*/
    if (length)
    {
      if (length>=(capacity-1))
      {
        /* keep the old pointer and capacity until the new block is known to be valid */
        char *grown=realloc(sent,capacity*2);

        if (grown==(char*)0)
        {
          fprintf(stderr,_("%s: increasing sentence buffer failed: %s\n"),cmd,strerror(errno));
          exit(2);
        }
        sent=grown;
        capacity*=2;
      }
      if (isspace(oc))
      {
        if (!inWhiteSpace)
        {
          sent[length++]=' ';
          inWhiteSpace=1;
        }
      }
      else
      {
        sent[length++]=oc;
        if
        (
          (length==3 && strncmp(sent+length-3,"...",3)==0 && (c==EOF || isspace(c)))
          || (length>=4 && strncmp(sent+length-4," ...",4)==0 && (c==EOF || isspace(c)))
        )
        {
          /* omission ellipsis */
          inWhiteSpace=0;
        }
        else if (length>=4 && !isspace(sent[length-4]) && strncmp(sent+length-3,"...",3)==0 && (c==EOF || isspace(c)))
        {
          /* beginning ellipsis */
          char foo;

          foo=sent[length-4];
          sent[length-4]='\0';
          process(sent,length-4,file,beginLine);
          sent[length-4]=foo;
          /* the last four characters start the next sentence */
          memmove(sent,sent+length-4,4);
          length=4;
          inParagraph=0;
          inWhiteSpace=0;
          beginLine=line;
        }
        else if (length>=4 && strncmp(sent+length-4,"...",3)==0 && (c==EOF || isspace(c)))
        {
          /* ending ellipsis */
          if (inWhiteSpace) --length;
          sent[length]='\0';
          process(sent,length,file,beginLine);
          length=0;
        }
        /* the period is already stored in sent[length-1], so the abbreviation test must look at the text before it */
        else if ((oc=='.' || oc==':' || oc=='!' || oc=='?') && (c==EOF || isspace(c)) && (oc!='.' || !endingInAbbrev(sent,length-1,lang)))
        {
          /* end of sentence */
          if (inWhiteSpace) --length;
          sent[length]='\0';
          process(sent,length,file,beginLine);
          length=0;
        }
        else
        {
          /* just a regular character */
          inWhiteSpace=0;
        }
      }
    }
    else if (isupper(oc))
    {
      /* an upper case letter starts a new sentence */
      inParagraph=0;
      sent[length++]=oc;
      inWhiteSpace=0;
      beginLine=line;
    }
    else if (!inParagraph && oc=='\n' && c==oc)
    {
      /* blank line outside a sentence */
      process("",0,file,line);
      inParagraph=1;
    }
    voc=oc;
  }
  if (!inParagraph) process("",0,file,line);
  regfree(&hashLine);
  free(sent);
}
/*}}}*/