This repository contains my bachelor's thesis and the associated TeX files, code snippets, and possibly more. Topic: Data Movement in Heterogeneous Memories with Intel Data Streaming Accelerator
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1001 lines
29 KiB

  1. /* Notes */ /*{{{C}}}*//*{{{*/
  2. /*
  3. This program is GNU software, copyright 1997-2003
  4. Michael Haardt <michael@moria.de>.
  5. This program is free software; you can redistribute it and/or modify it
  6. under the terms of the GNU General Public License as published by the
  7. Free Software Foundation; either version 2 of the License, or (at your
  8. option) any later version.
  9. This program is distributed in the hope that it will be useful, but
  10. WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  11. or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
  12. for more details.
  13. You should have received a copy of the GNU General Public License along
  14. with this program. If not, write to the Free Software Foundation, Inc.,
  15. 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  16. */
  17. /*}}}*/
  18. /* #includes */ /*{{{*/
  19. #undef _POSIX_SOURCE
  20. #define _POSIX_SOURCE 1
  21. #undef _POSIX_C_SOURCE
  22. #define _POSIX_C_SOURCE 2
  23. #include "config.h"
  24. #include <assert.h>
  25. #include <ctype.h>
  26. #include <errno.h>
  27. #include <locale.h>
  28. #ifdef HAVE_GETTEXT
  29. #include <libintl.h>
  30. #define _(String) gettext(String)
  31. #else
  32. #define _(String) String
  33. #endif
  34. #include <math.h>
  35. #include <stdio.h>
  36. #include <stdlib.h>
  37. #include <string.h>
  38. #include "getopt.h"
  39. #include "misc.h"
  40. #include "sentence.h"
  41. /*}}}*/
  42. /* variables */ /*{{{*/
  43. static const char *lc_ctype;
  44. enum lc_ctype_int { ASCII, ISO_8859_1 };
  45. static enum lc_ctype_int lc_ctype_int;
  46. static const char *docLanguage;
  47. static const char *phraseEnd = (const char*)0;
  48. /*}}}*/
  49. /* hit counting functions */ /*{{{*/
  50. struct Hit /*{{{*/
  51. {
  52. int *data;
  53. int capacity;
  54. int size;
  55. };
  56. /*}}}*/
  57. static void newHit(struct Hit *hit) /*{{{*/
  58. {
  59. if ((hit->data=malloc((hit->capacity=3)*sizeof(int)))==(int*)0)
  60. {
  61. fprintf(stderr,_("style: out of memory\n"));
  62. exit(1);
  63. }
  64. memset(hit->data,0,hit->capacity*sizeof(int));
  65. hit->size=0;
  66. }
  67. /*}}}*/
  68. static void noteHit(struct Hit *hit, int n) /*{{{*/
  69. {
  70. assert(n>0);
  71. if (n>hit->capacity)
  72. {
  73. if ((hit->data=realloc(hit->data,n*2*sizeof(int)))==(int*)0)
  74. {
  75. fprintf(stderr,_("style: out of memory\n"));
  76. exit(1);
  77. }
  78. memset(hit->data+hit->capacity,0,(n*2-hit->capacity)*sizeof(int));
  79. hit->capacity=n*2;
  80. }
  81. ++hit->data[n-1];
  82. if (n>hit->size) hit->size=n;
  83. }
  84. /*}}}*/
  85. /*}}}*/
  86. /* readability formulas */ /*{{{*/
  87. /**
  88. * Calculate Kincaid Formula (reading grade).
  89. * @param syllables number of syllables
  90. * @param words number of words
  91. * @param sentences number of sentences
  92. */
  93. static double kincaid(int syllables, int words, int sentences) /*{{{*/
  94. {
  95. return 11.8*(((double)syllables)/words)+0.39*(((double)words)/sentences)-15.59;
  96. }
  97. /*}}}*/
  98. /**
  99. * Calculate Automated Readability Index (reading grade).
  100. * @param letters number of letters
  101. * @param words the number of words
  102. * @param sentences the number of sentences
  103. */
  104. static double ari(int letters, int words, int sentences) /*{{{*/
  105. {
  106. return 4.71*(((double)letters)/words)+0.5*(((double)words)/sentences)-21.43;
  107. }
  108. /*}}}*/
  109. /**
  110. * Calculate Coleman-Liau Formula.
  111. * @param letters number of letters
  112. * @param words the number of words
  113. * @param sentences the number of sentences
  114. */
  115. static double coleman_liau(int letters, int words, int sentences) /*{{{*/
  116. {
  117. return 5.89*(((double)letters)/words)-0.3*(((double)sentences)/(100*words))-15.8;
  118. }
  119. /*}}}*/
  120. /**
  121. * Calculate Flesch reading ease formula.
  122. * @param syllables number of syllables
  123. * @param words number of words
  124. * @param sentences number of sentences
  125. */
  126. static double flesch(int syllables, int words, int sentences) /*{{{*/
  127. {
  128. return 206.835-84.6*(((double)syllables)/words)-1.015*(((double)words)/sentences);
  129. }
  130. /*}}}*/
  131. /**
  132. * Calculate fog index.
  133. * @param words the number of words in the text
  134. * @param bigwords the number of words which contain more than 3 syllables
  135. * @param sentences the number of sentences
  136. */
  137. static double fog(int words, int bigwords, int sentences) /*{{{*/
  138. {
  139. return ((((double)words)/sentences+(100.0*bigwords)/words)*0.4);
  140. }
  141. /*}}}*/
  142. /**
  143. * Calculate 1. neue Wiener Sachtextformel (WSTF). MIGHT BE WRONG!
  144. * @param words the number of words in the text
  145. * @param shortwords the number of words that contain one syllable
  146. * @param longwords the number of words that are longer than 6 characters
  147. * @param bigwords the number of words that contain more than 3 syllables
  148. * @param sentences number of sentences
  149. */
  150. static double wstf(int words, int shortwords, int longwords, int bigwords, int sentences) /*{{{*/
  151. {
  152. return 0.1935*((double)bigwords)/words+0.1672*((double)words)/sentences-0.1297*((double)longwords)/words-0.0327*((double)shortwords)/words-0.875;
  153. }
  154. /*}}}*/
  155. /**
  156. * Calculate Wheeler-Smith formula. MIGHT BE WRONG!
  157. * @param words the number of words in the text
  158. * @param bigwords the number of words that contain more than 3 syllables
  159. * @param sentences number of sentences
  160. * @returns the wheeler smith index as result and the grade level in grade.
  161. * If grade is 0, the index is lower than any grade, if the index is
  162. * 99, it is higher than any grade.
  163. */
  164. static double wheeler_smith(int *grade, int words, int bigwords, int sentences) /*{{{*/
  165. {
  166. double idx=(((double)words)/sentences) * 10.0 * (((double)bigwords)/words);
  167. if (idx<=16) *grade=0;
  168. else if (idx<=20) *grade=5;
  169. else if (idx<=24) *grade=6;
  170. else if (idx<=29) *grade=7;
  171. else if (idx<=34) *grade=8;
  172. else if (idx<=38) *grade=9;
  173. else if (idx<=42) *grade=10;
  174. else *grade=99;
  175. return idx;
  176. }
  177. /*}}}*/
  178. /**
  179. * Calculate Lix formula of Bjrnsson from Sweden.
  180. * @param words the number of words in the text
  181. * @param sentences number of sentences
  182. * @param longwords the number of words that are longer than 6 characters
  183. * @returns the wheeler smith index as result and the grade level in grade.
  184. * If grade is 0, the index is lower than any grade, if the index is
  185. * 99, it is higher than any grade.
  186. */
  187. static double lix(int *grade, int words, int longwords, int sentences) /*{{{*/
  188. {
  189. double idx=((double)words)/sentences+100.0*((double)longwords)/words;
  190. if (idx<34) *grade=0;
  191. else if (idx<38) *grade=5;
  192. else if (idx<41) *grade=6;
  193. else if (idx<44) *grade=7;
  194. else if (idx<48) *grade=8;
  195. else if (idx<51) *grade=9;
  196. else if (idx<54) *grade=10;
  197. else if (idx<57) *grade=11;
  198. else *grade=99;
  199. return idx;
  200. }
  201. /*}}}*/
  202. /**
  203. * Calculate SMOG-Grading.
  204. * @param bigwords the number of words that contain more than 3 syllables
  205. * @param sentences number of sentences
  206. */
  207. static double smog(int bigwords, int sentences) /*{{{*/
  208. {
  209. if (strncmp(docLanguage,"de",2)==0) return sqrt((((double)bigwords)/((double)sentences))*30)-2.0;
  210. else return sqrt((((double)bigwords)/((double)sentences))*30.0)+3.0;
  211. }
  212. /*}}}*/
  213. /*}}}*/
  214. /* word class checks */ /*{{{*/
  215. static int wordcmp(const char *r, const char *s) /*{{{*/
  216. {
  217. int res;
  218. while (*r)
  219. {
  220. if ((res=*r-tolower(*s))!=0) return res;
  221. ++r; ++s;
  222. }
  223. return isalpha(*s);
  224. }
  225. /*}}}*/
  226. /**
  227. * Test if the word is an article. This function uses docLanguage to
  228. * determine the used language.
  229. */
  230. static int article(const char *word, size_t l) /*{{{*/
  231. {
  232. static const char *de[]= /* German articles */ /*{{{*/
  233. {
  234. "der", "die", "das", "des", "dem", "den", "ein", "eine", "einer",
  235. "eines", "einem", "einen", (const char*)0
  236. };
  237. /*}}}*/
  238. static const char *en[]= /* English articles */ /*{{{*/
  239. {
  240. "the", "a", "an", (const char*)0
  241. };
  242. /*}}}*/
  243. const char **list;
  244. if (strncmp(docLanguage,"de",2)==0) list=de;
  245. else list=en;
  246. while (*list) if (wordcmp(*list,word)==0) return 1; else ++list;
  247. return 0;
  248. }
  249. /*}}}*/
  250. /**
  251. * Test if the word is a pronoun. This function uses docLanguage to
  252. * determine the used language.
  253. */
  254. static int pronoun(const char *word, size_t l) /*{{{*/
  255. {
  256. static const char *de[]= /* Pronomen */ /*{{{*/
  257. {
  258. "ich", "du", "er", "sie", "es", "wir", "ihr", "mein", "meine", "dein",
  259. "deine", "sein", "seine", "unser", "unsere", "euer", "eure", "mir",
  260. "mich", "dir", "dich", "ihre", (const char*)0
  261. };
  262. /*}}}*/
  263. static const char *en[]= /* pronouns */ /*{{{*/
  264. {
  265. "i", "me", "we", "us", "you", "he", "him", "she", "her", "it", "they",
  266. "them", "thou", "thee", "ye", "myself", "yourself", "himself",
  267. "herself", "itself", "ourselves", "yourselves", "themselves",
  268. "oneself", "my", "mine", "his", "hers", "yours", "ours", "theirs", "its",
  269. "our", "that", "their", "these", "this", "those", "your", (const char*)0
  270. };
  271. /*}}}*/
  272. const char **list;
  273. if (strncmp(docLanguage,"de",2)==0) list=de;
  274. else list=en;
  275. while (*list) if (wordcmp(*list,word)==0) return 1; else ++list;
  276. return 0;
  277. }
  278. /*}}}*/
  279. /**
  280. * Test if the word is an interrogative pronoun. This function uses
  281. * docLanguage to determine the used language.
  282. */
  283. static int interrogativePronoun(const char *word, size_t l) /*{{{*/
  284. {
  285. static const char *de[]= /* Interrogativpronomen */ /*{{{*/
  286. {
  287. "wer", "was", "wem", "wen", "wessen", "wo", "wie", "warum", "weshalb",
  288. "wann", "wieso", "weswegen", (const char*)0
  289. };
  290. /*}}}*/
  291. static const char *en[]= /* interrogative pronouns */ /*{{{*/
  292. {
  293. "why", "who", "what", "whom", "when", "where", "how", (const char*)0
  294. };
  295. /*}}}*/
  296. const char **list;
  297. if (strncmp(docLanguage,"de",2)==0) list=de;
  298. else list=en;
  299. while (*list) if (wordcmp(*list,word)==0) return 1; else ++list;
  300. return 0;
  301. }
  302. /*}}}*/
  303. static int conjunction(const char *word, size_t l) /*{{{*/
  304. {
  305. static const char *de[]= /* Konjunktionen */ /*{{{*/
  306. {
  307. "und", "oder", "aber", "sondern", "doch", "nur", "blo�", "denn",
  308. "weder", "noch", "sowie", (const char*)0
  309. };
  310. /*}}}*/
  311. static const char *en[]= /* conjunctions */ /*{{{*/
  312. {
  313. "and", "but", "or", "yet", "nor", (const char*)0
  314. };
  315. /*}}}*/
  316. const char **list;
  317. if (strncmp(docLanguage,"de",2)==0) list=de;
  318. else list=en;
  319. while (*list) if (wordcmp(*list,word)==0) return 1; else ++list;
  320. return 0;
  321. }
  322. /*}}}*/
  323. static int nominalization(const char *word, size_t l) /*{{{*/
  324. {
  325. static const char *de[]= /* Nominalisierungsendungen */ /*{{{*/
  326. {
  327. "ung", "heit", "keit", "nis", "tum", (const char*)0
  328. };
  329. /*}}}*/
  330. static const char *en[]= /* nominalization suffixes */ /*{{{*/
  331. {
  332. /* a bit limited, but it is exactly what the original style(1) did */
  333. "tion", "ment", "ence", "ance", (const char*)0
  334. };
  335. /*}}}*/
  336. const char **list;
  337. /* exclude words too short to have such long suffixes */
  338. if (l < 7) return 0;
  339. if (strncmp(docLanguage,"de",2)==0) list=de;
  340. else list=en;
  341. while (*list) if (wordcmp(*list,word+l-strlen(*list))==0) return 1; else ++list;
  342. return 0;
  343. }
  344. /*}}}*/
  345. static int subConjunction(const char *word, size_t l) /*{{{*/
  346. {
  347. static const char *de[]= /* unterordnende Konjunktionen */ /*{{{*/
  348. {
  349. "weil", "da", "dadurch", "wenn", "falls", "sofern", "obwohl",
  350. "obgleich", "als", "nachdem", "w�hrend", "wie", "ob", "je",
  351. "desto", "damit", "dass", "indem", "um zu", (const char*)0
  352. };
  353. /*}}}*/
  354. static const char *en[]= /* subordinating conjunctions */ /*{{{*/
  355. {
  356. "after","because", "lest", "till", "'til", "although", "before",
  357. "now that", "unless", "as", "even if", "provided that", "provided",
  358. "until", "as if", "even though", "since", "as long as", "so that",
  359. "whenever", "as much as", "if", "than", "as soon as", "inasmuch",
  360. "in order that", "though", "while", (const char*)0
  361. };
  362. /*}}}*/
  363. const char **list;
  364. if (strncmp(docLanguage,"de",2)==0) list=de;
  365. else list=en;
  366. while (*list)
  367. {
  368. if (wordcmp(*list,word)==0)
  369. {
  370. phraseEnd = word+strlen(*list);
  371. return 1;
  372. }
  373. else ++list;
  374. }
  375. return 0;
  376. }
  377. /*}}}*/
  378. static int preposition(const char *word, size_t l) /*{{{*/
  379. {
  380. static const char *de[]= /* Pr�positionen */ /*{{{*/
  381. {
  382. "aus", "au�er", "bei", "mit", "nach", "seit", "von", "zu",
  383. "bis", "durch", "f�r", "gegen", "ohne", "um", "an", "auf",
  384. "hinter", "in", "neben", "�ber", "unter", "vor", "zwischen",
  385. "anstatt", "statt", "trotz", "w�hrend", "wegen", (const char*)0
  386. };
  387. /*}}}*/
  388. static const char *en[]= /* prepositions */ /*{{{*/
  389. {
  390. "aboard", "about", "above", "according to", "across from",
  391. "after", "against", "alongside", "alongside of", "along with",
  392. "amid", "among", "apart from", "around", "aside from", "at", "away from",
  393. "back of", "because of", "before", "behind", "below", "beneath", "beside",
  394. "besides", "between", "beyond", "but", "by means of",
  395. "concerning", "considering", "despite", "down", "down from", "during",
  396. "except", "except for", "excepting for", "from among",
  397. "from between", "from under", "in addition to", "in behalf of",
  398. "in front of", "in place of", "in regard to", "inside of", "inside",
  399. "in spite of", "instead of", "into", "like", "near to", "off",
  400. "on account of", "on behalf of", "onto", "on top of", "on", "opposite",
  401. "out of", "out", "outside", "outside of", "over to", "over", "owing to",
  402. "past", "prior to", "regarding", "round about", "round",
  403. "since", "subsequent to", "together", "with", "throughout", "through",
  404. "till", "toward", "under", "underneath", "until", "unto", "up",
  405. "up to", "upon", "with", "within", "without", "across", "along",
  406. "by", "of", "in", "to", "near", "of", "from", (const char*)0
  407. };
  408. /*}}}*/
  409. const char **list;
  410. if (strncmp(docLanguage,"de",2)==0) list=de;
  411. else list=en;
  412. while (*list)
  413. {
  414. if (wordcmp(*list,word)==0)
  415. {
  416. phraseEnd = word+strlen(*list);
  417. return 1;
  418. }
  419. else ++list;
  420. }
  421. return 0;
  422. }
  423. /*}}}*/
  424. static int auxVerb(const char *word, size_t l) /*{{{*/
  425. {
  426. static const char *de[]= /* Hilfsverben */ /*{{{*/
  427. {
  428. "haben", "habe", "hast", "hat", "habt", "gehabt", "h�tte", "h�ttest",
  429. "h�tten", "h�ttet",
  430. "werden", "werde", "wirst", "wird", "werdet", "geworden", "w�rde",
  431. "w�rdest", "w�rden", "w�rdet",
  432. "k�nnen", "kann", "kannst", "k�nnt", "konnte", "konntest", "konnten",
  433. "konntet", "gekonnt", "k�nnte", "k�nntest", "k�nnten", "k�nntet",
  434. "m�ssen", "muss", "musst", "m�sst", "musste", "musstest", "mussten",
  435. "gemusst", "m�sste", "m�sstest", "m�ssten", "m�sstet",
  436. "sollen", "soll", "sollst", "sollt", "sollte", "solltest", "solltet",
  437. "sollten", "gesollt",
  438. (const char*)0
  439. };
  440. /*}}}*/
  441. static const char *en[]= /* auxiliary verbs */ /*{{{*/
  442. {
  443. "will", "shall", "cannot", "may", "need to", "would", "should",
  444. "could", "might", "must", "ought", "ought to", "can't", "can",
  445. (const char*)0
  446. };
  447. /*}}}*/
  448. const char **list;
  449. if (strncmp(docLanguage,"de",2)==0) list=de;
  450. else list=en;
  451. while (*list)
  452. {
  453. if (wordcmp(*list,word)==0)
  454. {
  455. phraseEnd = word+strlen(*list);
  456. return 1;
  457. }
  458. else ++list;
  459. }
  460. return 0;
  461. }
  462. /*}}}*/
  463. static int tobeVerb(const char *word, size_t l) /*{{{*/
  464. {
  465. static const char *de[]= /* Hilfsverb sein */ /*{{{*/
  466. {
  467. "sein", "bin", "bist", "ist", "sind", "seid", "war", "warst", "wart",
  468. "waren", "gewesen", "w�re", "w�rst", "w�r", "w�ren", "w�rt", "w�ret",
  469. (const char*)0
  470. };
  471. /*}}}*/
  472. static const char *en[]= /* auxiliary verb to be */ /*{{{*/
  473. {
  474. "be", "being", "was", "were", "been", "are", "is", (const char*)0
  475. };
  476. /*}}}*/
  477. const char **list;
  478. if (strncmp(docLanguage,"de",2)==0) list=de;
  479. else list=en;
  480. while (*list) if (wordcmp(*list,word)==0) return 1; else ++list;
  481. return 0;
  482. }
  483. /*}}}*/
  484. /*}}}*/
  485. /* syllable counting */ /*{{{*/
  486. /**
  487. * Check if the character is pronounced as a vowel.
  488. */
  489. static int vowel(char c) /*{{{*/
  490. {
  491. switch (lc_ctype_int)
  492. {
  493. case ASCII: return (c=='a' || c=='e' || c=='i' || c=='o' || c=='u' || c=='y');
  494. case ISO_8859_1: return (c=='a' || c=='' || c=='e' || c=='i' || c=='o' || c=='' || c=='u' || c=='' || c=='y');
  495. default: assert(0);
  496. }
  497. }
  498. /*}}}*/
  499. /**
  500. * Count syllables for english words by counting vowel-consonant pairs.
  501. * @param s the word
  502. * @param l the word's length
  503. */
  504. static int syll_en(const char *s, size_t l) /*{{{*/
  505. {
  506. int count=0;
  507. if (l>=2 && *(s+l-2)=='e' && *(s+l-1)=='d') l-=2;
  508. while (l)
  509. {
  510. if (l>=2 && vowel(*s) && !vowel(*(s+1))) { ++count; s+=2; l-=2; }
  511. else { ++s; --l; }
  512. }
  513. return (count==0 ? 1 : count);
  514. }
  515. /*}}}*/
  516. /**
  517. * Count syllables for German words by counting vowel-consonant or
  518. * consonant-vowel pairs, depending on the first character being a vowel or
  519. * not. If it is, a trailing e will be handled with a special rule. This
  520. * algorithm fails on "vor-ueber".
  521. * @param s the word
  522. * @param l the word's length
  523. */
  524. static int syll_de(const char *s, size_t l) /*{{{*/
  525. {
  526. int count=0;
  527. size_t ol=l;
  528. if (vowel(*s))
  529. while (l)
  530. {
  531. if (l>=2 && vowel(*s) && !vowel(*(s+1))) { ++count; s+=2; l-=2; }
  532. else if (l==1 && ol>1 && !vowel(*(s-1)) && *s=='e') { ++count; s+=1; l-=1; }
  533. else { ++s; --l; }
  534. }
  535. else
  536. while (l)
  537. {
  538. if (l>=2 && !vowel(*s) && vowel(*(s+1))) { ++count; s+=2; l-=2; }
  539. else { ++s; --l; }
  540. }
  541. return (count==0 ? 1 : count);
  542. }
  543. /*}}}*/
  544. /**
  545. * Count syllables. First, charset is set to the used character set.
  546. * Depending on the language, the right counting function is called.
  547. * @param s the word
  548. * @param l the word's length
  549. */
  550. static int syll(const char *s, size_t l) /*{{{*/
  551. {
  552. assert(s!=(const char*)0);
  553. assert(l>=1);
  554. if (strncmp(docLanguage,"de",2)==0) return syll_de(s,l);
  555. else return syll_en(s,l);
  556. }
  557. /*}}}*/
  558. /*}}}*/
  559. /* global style() variables */ /*{{{*/
  560. static int characters;
  561. static int syllables;
  562. static int words;
  563. static int shortwords;
  564. static int longwords;
  565. static int bigwords;
  566. static int sentences;
  567. static int questions;
  568. static int passiveSent;
  569. static int beginArticles;
  570. static int beginPronouns;
  571. static int pronouns;
  572. static int beginInterrogativePronouns;
  573. static int interrogativePronouns;
  574. static int beginConjunctions;
  575. static int conjunctions;
  576. static int nominalizations;
  577. static int prepositions;
  578. static int beginPrepositions;
  579. static int beginSubConjunctions;
  580. static int subConjunctions;
  581. static int auxVerbs;
  582. static int tobeVerbs;
  583. static int shortestLine,shortestLength;
  584. static int longestLine,longestLength;
  585. static int paragraphs;
  586. static int printLongSentences=0;
  587. static int printNomSentences=0;
  588. static int printPassiveSentences=0;
  589. static float printARI=0.0;
  590. static struct Hit lengths;
  591. /*}}}*/
  592. /**
  593. * Process one sentence.
  594. * @param str sentence
  595. * @param length its length
  596. */
  597. static void style(const char *str, size_t length, const char *file, int line) /*{{{*/
  598. {
  599. int firstWord=1;
  600. int inword=0;
  601. int innumber=0;
  602. int wordLength=-1;
  603. int sentWords=0;
  604. int sentLetters=0;
  605. int count;
  606. int passive=0;
  607. int nom=0;
  608. const char *s=str;
  609. if (length==0) { ++paragraphs; return; }
  610. assert(str!=(const char*)0);
  611. assert(length>=2);
  612. phraseEnd = (const char*)0;
  613. while (*s)
  614. {
  615. if (inword)
  616. {
  617. if (!isalpha(*s))
  618. {
  619. inword=0;
  620. count=syll(s-wordLength,wordLength);
  621. syllables+=count;
  622. if (count>=3) ++bigwords;
  623. else if (count==1) ++shortwords;
  624. if (wordLength>6) ++longwords;
  625. if (s-wordLength > phraseEnd)
  626. {
  627. /* part of speech tagging-- order matters! */
  628. if (article(s-wordLength,wordLength) && firstWord) ++beginArticles;
  629. else if (pronoun(s-wordLength,wordLength))
  630. {
  631. ++pronouns;
  632. if (firstWord) ++beginPronouns;
  633. }
  634. else if (interrogativePronoun(s-wordLength,wordLength))
  635. {
  636. ++interrogativePronouns;
  637. if (firstWord) ++beginInterrogativePronouns;
  638. }
  639. else if (conjunction(s-wordLength,wordLength))
  640. {
  641. ++conjunctions;
  642. if (firstWord) ++beginConjunctions;
  643. }
  644. else if (subConjunction(s-wordLength,wordLength))
  645. {
  646. ++subConjunctions;
  647. if (firstWord) ++beginSubConjunctions;
  648. }
  649. else if (preposition(s-wordLength,wordLength))
  650. {
  651. ++prepositions;
  652. if (firstWord) ++beginPrepositions;
  653. }
  654. else if (tobeVerb(s-wordLength,wordLength))
  655. {
  656. ++passive;
  657. ++tobeVerbs;
  658. }
  659. else if (auxVerb(s-wordLength,wordLength)) ++auxVerbs;
  660. else if (nominalization(s-wordLength,wordLength))
  661. {
  662. ++nom;
  663. ++nominalizations;
  664. }
  665. }
  666. if (firstWord) firstWord = 0;
  667. }
  668. else
  669. {
  670. ++wordLength;
  671. ++characters;
  672. ++sentLetters;
  673. }
  674. }
  675. else if (innumber)
  676. {
  677. if (!isdigit(*s))
  678. {
  679. innumber=0;
  680. ++syllables;
  681. }
  682. else
  683. {
  684. ++wordLength;
  685. ++characters;
  686. ++sentLetters;
  687. }
  688. }
  689. else
  690. {
  691. if (isalpha(*s))
  692. {
  693. ++words;
  694. ++sentWords;
  695. inword=1;
  696. wordLength=1;
  697. ++characters;
  698. ++sentLetters;
  699. }
  700. else if (isdigit(*s))
  701. {
  702. ++words;
  703. ++sentWords;
  704. innumber=1;
  705. wordLength=1;
  706. ++characters;
  707. ++sentLetters;
  708. }
  709. }
  710. ++s;
  711. }
  712. ++sentences;
  713. if (shortestLine==0 || sentWords<shortestLength)
  714. {
  715. shortestLine=sentences;
  716. shortestLength=sentWords;
  717. }
  718. if (longestLine==0 || sentWords>longestLength)
  719. {
  720. longestLine=sentences;
  721. longestLength=sentWords;
  722. }
  723. if (str[length-1]=='?') ++questions;
  724. noteHit(&lengths,sentWords);
  725. if (passive) ++passiveSent;
  726. if ((printLongSentences && sentWords>=printLongSentences)
  727. || (printARI && ari(sentLetters,sentWords,1)>printARI)
  728. || (printPassiveSentences && passive)
  729. || (printNomSentences && nom)) printf("%s:%d: %s\n",file,line,str);
  730. }
  731. /*}}}*/
  732. static void print_usage(FILE *handle) /*{{{*/
  733. {
  734. fputs(_("\
  735. Usage: style [-L language] [-l length] [-r ari] [file ...]\n\
  736. style [--language language] [--print-long length] [--print-ari ari]\n\
  737. [file ...]\n\
  738. style --version\n"),handle);
  739. }
  740. /*}}}*/
  741. int main(int argc, char *argv[]) /*{{{*/
  742. {
  743. /* variables */ /*{{{*/
  744. int usage=0,c;
  745. static struct option lopts[]=
  746. {
  747. { "help", no_argument, 0, 'h' },
  748. { "print-long", required_argument, 0, 'l' },
  749. { "language", required_argument, 0, 'L' },
  750. { "print-ari", required_argument, 0, 'r' },
  751. { "version", no_argument, 0, 'v' },
  752. { "print-passive", no_argument, 0, 'p' },
  753. { "print-nom", no_argument, 0, 'N' },
  754. { "print-nom-passive", no_argument, 0, 'N' },
  755. { (const char*)0, 0, 0, '\0' }
  756. };
  757. /*}}}*/
  758. /* locale */ /*{{{*/
  759. setlocale(LC_ALL,"");
  760. #ifdef HAVE_GETTEXT
  761. bindtextdomain("diction", LOCALEDIR);
  762. textdomain("diction");
  763. #endif
  764. /*}}}*/
  765. /* parse options */ /*{{{*/
  766. #if 0
  767. lc_ctype=setlocale(LC_CTYPE,(const char*)0);
  768. docLanguage=setlocale(LC_MESSAGES,(const char*)0);
  769. #else
  770. if ((lc_ctype=getenv("LC_CTYPE"))==(const char*)0) lc_ctype="C";
  771. if ((docLanguage=getenv("LC_MESSAGES"))==(const char*)0) docLanguage="C";
  772. #endif
  773. if (strcmp(docLanguage,"C")==0) docLanguage="en";
  774. if (strstr(lc_ctype,"8859-1")) lc_ctype_int=ISO_8859_1;
  775. else lc_ctype_int=ASCII;
  776. while ((c=getopt_long(argc,argv,"l:L:r:hpnN",lopts,(int*)0))!=EOF) switch(c)
  777. {
  778. case 'l':
  779. {
  780. char *end;
  781. printLongSentences=strtol(optarg,&end,10);
  782. if (end==optarg || *end!='\0') usage=1;
  783. break;
  784. }
  785. case 'L':
  786. {
  787. docLanguage=optarg;
  788. break;
  789. }
  790. case 'r':
  791. {
  792. char *end;
  793. printARI=strtod(optarg,&end);
  794. if (end==optarg || *end!='\0') usage=1;
  795. break;
  796. }
  797. case 'p':
  798. {
  799. printPassiveSentences=1;
  800. break;
  801. }
  802. case 'N':
  803. {
  804. printNomSentences=1;
  805. break;
  806. }
  807. case 'n':
  808. {
  809. printNomSentences=1;
  810. printPassiveSentences=1;
  811. break;
  812. }
  813. case 'v': fputs("GNU style " VERSION "\n",stdout); exit(0);
  814. case 'h': usage=2; break;
  815. default: usage=1; break;
  816. }
  817. if (usage==1)
  818. {
  819. print_usage(stderr);
  820. fputs("\n",stderr);
  821. fputs(_("Try style -h|--help for more information.\n"),stderr);
  822. exit(1);
  823. }
  824. else if (usage==2)
  825. {
  826. print_usage(stdout);
  827. fputs("\n",stdout);
  828. fputs(_("Analyse surface characteristics of a document.\n\n"),stdout);
  829. fputs(_("\
  830. -L, --language set the document language.\n\
  831. -l, --print-long print all sentences longer than <length> words\n\
  832. -r, --print-ari print all sentences with an ARI greater than than <ari>\n\
  833. -p, --print-passive print all sentences phrased in the passive voice\n\
  834. -N, --print-nom print all sentences containing nominalizations\n\
  835. -n, --print-nom-passive print all sentences phrased in the passive voice or\n\
  836. containing nominalizations\n"),stdout);
  837. fputs(_("\
  838. -h, --help print this message\n\
  839. --version print the version\n"),stdout);
  840. fputs("\n",stdout);
  841. fputs(_("Report bugs to <michael@moria.de>.\n"),stdout);
  842. exit(0);
  843. }
  844. /*}}}*/
  845. newHit(&lengths);
  846. if (optind==argc) sentence("style",stdin,"(stdin)",style,docLanguage);
  847. else while (optind<argc)
  848. {
  849. FILE *fp;
  850. if ((fp=fopen(argv[optind],"r"))==(FILE*)0)
  851. fprintf(stderr,_("style: Opening `%s' failed (%s).\n"),argv[optind],strerror(errno));
  852. else
  853. {
  854. sentence("style",fp,argv[optind],style,docLanguage);
  855. fclose(fp);
  856. }
  857. ++optind;
  858. }
  859. if (sentences==0)
  860. {
  861. printf(_("No sentences found.\n"));
  862. }
  863. else
  864. {
  865. int wsg;
  866. int lixg;
  867. int i,shortLength,shortSent,longLength,longSent;
  868. printf(_("readability grades:\n"));
  869. printf(" %s: %.1f\n",_("Kincaid"),kincaid(syllables,words,sentences));
  870. printf(" %s: %.1f\n",_("ARI"),ari(characters,words,sentences));
  871. printf(" %s: %.1f\n",_("Coleman-Liau"),coleman_liau(characters,words,sentences));
  872. printf(" %s: %.1f\n",_("Flesch Index"),flesch(syllables,words,sentences));
  873. printf(" %s: %.1f\n",_("Fog Index"),fog(words,bigwords,sentences));
  874. #ifdef MIGHT_BE_WRONG
  875. printf(" %s: %.1f\n",_("1. WSTF Index"),wstf(words,shortwords,longwords,bigwords,sentences));
  876. printf(" %s: %.1f = ",_("Wheeler-Smith Index"),wheeler_smith(&wsg,words,bigwords,sentences));
  877. if (wsg==0) printf(_("below school year 5\n"));
  878. else if (wsg==99) printf(_("higher than school year 10\n"));
  879. else printf(_("school year %d\n"),wsg);
  880. #endif
  881. printf(" %s: %.1f = ",_("Lix"),lix(&lixg,words,longwords,sentences));
  882. if (lixg==0) printf(_("below school year 5\n"));
  883. else if (lixg==99) printf(_("higher than school year 11\n"));
  884. else printf(_("school year %d\n"),lixg);
  885. printf(" %s: %.1f\n",_("SMOG-Grading"),smog(bigwords,sentences));
  886. printf(_("sentence info:\n"));
  887. printf(_(" %d characters\n"),characters);
  888. printf(_(" %d words, average length %.2f characters = %.2f syllables\n"),words,((double)characters)/words,((double)syllables)/words);
  889. printf(_(" %d sentences, average length %.1f words\n"),sentences,((double)words)/sentences);
  890. shortLength=((double)words)/sentences-4.5;
  891. if (shortLength<1) shortLength=1;
  892. for (i=0,shortSent=0; i<=shortLength; ++i) shortSent+=lengths.data[i];
  893. printf(_(" %d%% (%d) short sentences (at most %d words)\n"),100*shortSent/sentences,shortSent,shortLength);
  894. longLength=((double)words)/sentences+10.5;
  895. for (i=longLength,longSent=0; i<=lengths.size; ++i) longSent+=lengths.data[i];
  896. printf(_(" %d%% (%d) long sentences (at least %d words)\n"),100*longSent/sentences,longSent,longLength);
  897. printf(_(" %d paragraphs, average length %.1f sentences\n"),paragraphs,((double)sentences)/paragraphs);
  898. printf(_(" %d%% (%d) questions\n"),100*questions/sentences,questions);
  899. printf(_(" %d%% (%d) passive sentences\n"),100*passiveSent/sentences,passiveSent);
  900. printf(_(" longest sent %d wds at sent %d; shortest sent %d wds at sent %d\n"),longestLength,longestLine,shortestLength,shortestLine);
  901. /*
  902. Missing output:
  903. sentence types:
  904. simple 100% (1) complex 0% (0)
  905. compound 0% (0) compound-complex 0% (0)
  906. word usage:
  907. verb types as % of total verbs
  908. tobe 100% (1) aux 0% (0) inf 0% (0)
  909. passives as % of non-inf verbs 0% (0)
  910. types as % of total
  911. prep 0.0% (0) conj 0.0% (0) adv 0.0% (0)
  912. noun 25.0% (1) adj 25.0% (1) pron 25.0% (1)
  913. nominalizations 0 % (0)
  914. */
  915. if (strncmp(docLanguage,"en",2)==0)
  916. {
  917. printf(_("word usage:\n"));
  918. printf(_(" verb types:\n"));
  919. printf(_(" to be (%d) auxiliary (%d) \n"), tobeVerbs, auxVerbs);
  920. printf(_(" types as %% of total:\n"));
  921. printf(_(" conjunctions %1.f% (%d) pronouns %1.f% (%d) prepositions %1.f% (%d)\n"),
  922. (100.0*(conjunctions+subConjunctions))/words,
  923. conjunctions+subConjunctions,
  924. (100.0*pronouns)/words, pronouns, (100.0*prepositions)/words,
  925. prepositions);
  926. printf(_(" nominalizations %1.f% (%d)\n"),
  927. (100.0*nominalizations)/words, nominalizations);
  928. }
  929. printf(_("sentence beginnings:\n"));
  930. printf(_(" pronoun (%d) interrogative pronoun (%d) article (%d)\n"),beginPronouns,beginInterrogativePronouns,beginArticles);
  931. if (strncmp(docLanguage,"en",2)==0)
  932. {
  933. printf(_(" subordinating conjunction (%d) conjunction (%d) preposition (%d)\n"), beginSubConjunctions,beginConjunctions,beginPrepositions);
  934. }
  935. /*
  936. subject opener: noun (0) pron (1) pos (0) adj (0) art (0) tot 100%
  937. prep 0% (0) adv 0% (0)
  938. verb 0% (0) sub_conj 0% (0) conj 0% (0)
  939. expletives 0% (0)
  940. */
  941. }
  942. exit(0);
  943. }
  944. /*}}}*/