[Midnightbsd-cvs] mports [20508] trunk/textproc: add libtextcat
laffer1 at midnightbsd.org
laffer1 at midnightbsd.org
Sat Oct 10 16:16:48 EDT 2015
Revision: 20508
http://svnweb.midnightbsd.org/mports/?rev=20508
Author: laffer1
Date: 2015-10-10 16:16:47 -0400 (Sat, 10 Oct 2015)
Log Message:
-----------
add libtextcat
Modified Paths:
--------------
trunk/textproc/Makefile
Added Paths:
-----------
trunk/textproc/libtextcat/
trunk/textproc/libtextcat/Makefile
trunk/textproc/libtextcat/distinfo
trunk/textproc/libtextcat/files/
trunk/textproc/libtextcat/files/patch-src_Makefile.in
trunk/textproc/libtextcat/files/patch-src_constants.h
trunk/textproc/libtextcat/files/patch-src_fingerprint.c
trunk/textproc/libtextcat/files/patch-src_fingerprint.h
trunk/textproc/libtextcat/files/patch-src_textcat.c
trunk/textproc/libtextcat/files/patch-src_textcat.h
trunk/textproc/libtextcat/files/patch-src_utf8misc.c
trunk/textproc/libtextcat/files/patch-src_utf8misc.h
trunk/textproc/libtextcat/pkg-descr
trunk/textproc/libtextcat/pkg-plist
Modified: trunk/textproc/Makefile
===================================================================
--- trunk/textproc/Makefile 2015-10-10 20:15:23 UTC (rev 20507)
+++ trunk/textproc/Makefile 2015-10-10 20:16:47 UTC (rev 20508)
@@ -56,6 +56,7 @@
SUBDIR += latex-service
SUBDIR += libcroco
SUBDIR += libcue
+SUBDIR += libtextcat
SUBDIR += libtre
SUBDIR += libuninameslist
SUBDIR += libwpd
Added: trunk/textproc/libtextcat/Makefile
===================================================================
--- trunk/textproc/libtextcat/Makefile (rev 0)
+++ trunk/textproc/libtextcat/Makefile 2015-10-10 20:16:47 UTC (rev 20508)
@@ -0,0 +1,41 @@
+# Created by: thierry at pompo.net
+# $FreeBSD: head/textproc/libtextcat/Makefile 360343 2014-07-03 08:29:15Z tijl $
+# $MidnightBSD$
+
+PORTNAME= libtextcat
+PORTVERSION= 2.2
+CATEGORIES= textproc
+MASTER_SITES= http://software.wise-guys.nl/download/
+
+MAINTAINER= ports at MidnightBSD.org
+COMMENT= Language guessing by N-Gram-Based Text Categorization
+
+LICENSE= bsd3
+LICENSE_FILE= ${WRKSRC}/LICENSE
+
+GNU_CONFIGURE= yes
+USES= libtool
+USE_LDCONFIG= yes
+
+OPTIONS_DEFINE= DOCS
+# NOTE(review): post-install below stages the PORTDOCS files unconditionally, i.e. also when the DOCS option is off - confirm this is intended.
+PORTDOCS= README TODO
+# Stage textcat.h, the bundled *.lm language models, conf.txt and the PORTDOCS files.
+post-install:
+ ${INSTALL_DATA} ${WRKSRC}/src/textcat.h ${STAGEDIR}${PREFIX}/include/
+ ${MKDIR} ${STAGEDIR}${DATADIR}/LM
+ @${ECHO_MSG} "Installing language models provided in Gertjan van Noord's TextCat package"
+ (cd ${WRKSRC}/langclass/LM && \
+ ${FIND} . -name "*.lm" -exec ${INSTALL_DATA} "{}" "${STAGEDIR}${DATADIR}/LM/{}" \;)
+ ${INSTALL_DATA} ${WRKSRC}/langclass/conf.txt "${STAGEDIR}${DATADIR}"
+ ${MKDIR} ${STAGEDIR}${DOCSDIR}
+ ${INSTALL_DATA} ${PORTDOCS:S|^|${WRKSRC}/|} ${STAGEDIR}${DOCSDIR}
+# Feed every langclass/ShortTexts/*.txt sample to the built testtextcat, using the bundled conf.txt.
+regression-test:
+ (cd ${WRKSRC}/langclass/ && \
+ for t in `${LS} ShortTexts/*.txt` ; do \
+ ${ECHO_MSG} "Analyzing $$t..." ; \
+ ../src/testtextcat conf.txt < $$t ; \
+ done)
+
+.include <bsd.port.mk>
Property changes on: trunk/textproc/libtextcat/Makefile
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/textproc/libtextcat/distinfo
===================================================================
--- trunk/textproc/libtextcat/distinfo (rev 0)
+++ trunk/textproc/libtextcat/distinfo 2015-10-10 20:16:47 UTC (rev 20508)
@@ -0,0 +1,2 @@
+SHA256 (libtextcat-2.2.tar.gz) = 5677badffc48a8d332e345ea4fe225e3577f53fc95deeec8306000b256829655
+SIZE (libtextcat-2.2.tar.gz) = 540999
Property changes on: trunk/textproc/libtextcat/distinfo
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/textproc/libtextcat/files/patch-src_Makefile.in
===================================================================
--- trunk/textproc/libtextcat/files/patch-src_Makefile.in (rev 0)
+++ trunk/textproc/libtextcat/files/patch-src_Makefile.in 2015-10-10 20:16:47 UTC (rev 20508)
@@ -0,0 +1,65 @@
+--- src/Makefile.in.orig 2003-05-22 13:39:52.000000000 +0200
++++ src/Makefile.in 2011-01-31 20:51:27.000000000 +0100
+@@ -126,18 +126,18 @@
+
+ WARNS = -W -Wall -Wshadow -Wpointer-arith
+ IFLAGS =
+-FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
++FLAGS = -g -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
+ VERBOSE = -DVERBOSE
+ AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS)
+ AM_LDFLAGS = -g
+
+ noinst_HEADERS = \
+- common.h constants.h fingerprint.h textcat.h wg_mempool.h
++ common.h constants.h fingerprint.h textcat.h wg_mempool.h utf8misc.h
+
+
+ lib_LTLIBRARIES = libtextcat.la
+ libtextcat_la_SOURCES = \
+- common.c fingerprint.c textcat.c wg_mempool.c
++ common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c
+
+
+ bin_PROGRAMS = createfp
+@@ -156,7 +156,7 @@
+ libtextcat_la_LDFLAGS =
+ libtextcat_la_LIBADD =
+ am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \
+- wg_mempool.lo
++ wg_mempool.lo utf8misc.lo
+ libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS)
+ bin_PROGRAMS = createfp$(EXEEXT)
+ noinst_PROGRAMS = testtextcat$(EXEEXT)
+@@ -187,7 +187,9 @@
+ $(AM_LDFLAGS) $(LDFLAGS) -o $@
+ DIST_SOURCES = $(libtextcat_la_SOURCES) $(createfp_SOURCES) \
+ $(testtextcat_SOURCES)
+-HEADERS = $(noinst_HEADERS)
++
++# Needed for LibreOffice
++HEADERS = $(noinst_HEADERS) $(CONFIG_HEADER)
+
+ DIST_COMMON = $(noinst_HEADERS) Makefile.am Makefile.in config.h.in
+ SOURCES = $(libtextcat_la_SOURCES) $(createfp_SOURCES) $(testtextcat_SOURCES)
+@@ -264,6 +266,11 @@
+ else :; fi; \
+ done
+
++install-HEADERS:
++ @$(NORMAL_INSTALL)
++ $(mkinstalldirs) $(DESTDIR)$(pkgincludedir)
++ $(INSTALL_HEADER) $(HEADERS) $(DESTDIR)$(pkgincludedir)
++
+ uninstall-binPROGRAMS:
+ @$(NORMAL_UNINSTALL)
+ @list='$(bin_PROGRAMS)'; for p in $$list; do \
+@@ -490,7 +497,7 @@
+
+ install-data-am:
+
+-install-exec-am: install-binPROGRAMS install-libLTLIBRARIES
++install-exec-am: install-binPROGRAMS install-libLTLIBRARIES install-HEADERS
+
+ install-info: install-info-am
+
Property changes on: trunk/textproc/libtextcat/files/patch-src_Makefile.in
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/textproc/libtextcat/files/patch-src_constants.h
===================================================================
--- trunk/textproc/libtextcat/files/patch-src_constants.h (rev 0)
+++ trunk/textproc/libtextcat/files/patch-src_constants.h 2015-10-10 20:16:47 UTC (rev 20508)
@@ -0,0 +1,45 @@
+--- src/constants.h.orig Thu May 22 13:32:43 2003
++++ src/constants.h Thu Aug 23 22:47:07 2007
+@@ -39,6 +39,8 @@
+ */
+ #include <limits.h>
+
++#define _UTF8_
++
+ #define DESCRIPTION "out of place"
+
+ /* Reported matches are those fingerprints with a score less than best
+@@ -59,14 +61,21 @@
+ /* Maximum number of n-grams in a fingerprint */
+ #define MAXNGRAMS 400
+
+-/* Maximum size of an n-gram? */
+-#define MAXNGRAMSIZE 5
++/* Maximum number of characters (symbols) in an n-gram */
++#define MAXNGRAMSYMBOL 5
++
++/* Maximum size in bytes of the string representing an n-gram (must be at least the number of symbols) */
++#ifdef _UTF8_
++#define MAXNGRAMSIZE 20
++#else
++#define MAXNGRAMSIZE MAXNGRAMSYMBOL
++#endif
+
+ /* Which characters are not acceptable in n-grams? */
+ #define INVALID(c) (isspace((int)c) || isdigit((int)c))
+
+ /* Minimum size (in characters) for accepting a document */
+-#define MINDOCSIZE 25
++#define MINDOCSIZE 6
+
+ /* Maximum penalty for missing an n-gram in fingerprint */
+ #define MAXOUTOFPLACE 400
+@@ -75,5 +84,8 @@
+ #define TABLEPOW 13
+
+ #define MAXSCORE INT_MAX
++
++/* where the fingerprints files are stored */
++#define DEFAULT_FINGERPRINTS_PATH ""
+
+ #endif
Property changes on: trunk/textproc/libtextcat/files/patch-src_constants.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/textproc/libtextcat/files/patch-src_fingerprint.c
===================================================================
--- trunk/textproc/libtextcat/files/patch-src_fingerprint.c (rev 0)
+++ trunk/textproc/libtextcat/files/patch-src_fingerprint.c 2015-10-10 20:16:47 UTC (rev 20508)
@@ -0,0 +1,164 @@
+--- src/fingerprint.c.orig Thu May 22 13:32:43 2003
++++ src/fingerprint.c Thu Aug 23 22:47:07 2007
+@@ -63,6 +63,10 @@
+ * - put table/heap datastructure in a separate file.
+ */
+
++#ifndef _UTF8_
++#define _UTF8_
++#endif
++
+ #include "config.h"
+ #include <stdio.h>
+ #ifdef HAVE_STDLIB_H
+@@ -80,10 +84,12 @@
+ #include "wg_mempool.h"
+ #include "constants.h"
+
++#include "utf8misc.h"
+
+ #define TABLESIZE (1<<TABLEPOW)
+ #define TABLEMASK ((TABLESIZE)-1)
+
++
+ typedef struct {
+
+ sint2 rank;
+@@ -134,29 +140,14 @@
+ }
+
+
+-/* checks if n-gram lex is a prefix of key and of length len */
+-inline int issame( char *lex, char *key, int len )
+-{
+- int i;
+- for (i=0; i<len; i++) {
+- if ( key[i] != lex[i] ) {
+- return 0;
+- }
+- }
+- if ( lex[i] != 0 ) {
+- return 0;
+- }
+- return 1;
+-}
+-
+
+ /* increases frequency of ngram(p,len) */
+-static inline int increasefreq( table_t *t, char *p, int len )
+-{
+- uint4 hash = simplehash( p, len ) & TABLEMASK;
++static int increasefreq( table_t *t, char *p, int len )
++{
++ uint4 hash = simplehash( p, len ) & TABLEMASK;
+ entry_t *entry = t->table[ hash ];
+-
+- while ( entry ) {
++
++ while ( entry ) {
+ if ( issame( entry->str, p, len ) ) {
+ /*** Found it! ***/
+ entry->cnt++;
+@@ -168,7 +159,7 @@
+ }
+
+ /*** Not found, so create ***/
+- entry = wgmempool_alloc( t->pool, sizeof(entry_t) );
++ entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) ));
+ strcpy( entry->str, p );
+ entry->cnt = 1;
+
+@@ -181,12 +172,12 @@
+ #if 0
+
+ /* looks up ngram(p,len) */
+-static entry_t *findfreq( table_t *t, char *p, int len )
+-{
+- uint4 hash = simplehash( p, len ) & TABLEMASK;
++static entry_t *findfreq( table_t *t, char *p, int len )
++{
++ uint4 hash = simplehash( p, len ) & TABLEMASK;
+ entry_t *entry = t->table[ hash ];
+-
+- while ( entry ) {
++
++ while ( entry ) {
+ if ( issame( entry->str, p, len ) ) {
+ return entry;
+ }
+@@ -219,7 +210,7 @@
+ #define GREATER(x,y) ((x).cnt > (y).cnt)
+ #define LESS(x,y) ((x).cnt < (y).cnt)
+
+-inline static void siftup( table_t *t, unsigned int child )
++static void siftup( table_t *t, unsigned int child )
+ {
+ entry_t *heap = t->heap;
+ unsigned int parent = (child-1) >> 1;
+@@ -241,7 +232,7 @@
+ }
+
+
+-inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
++static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
+ {
+ entry_t *heap = t->heap;
+ unsigned int child = parent*2 + 1;
+@@ -458,21 +449,27 @@
+ return dest;
+ }
+
+-
++/**
++* This function extracts all n-grams from the passed buffer and puts them into the table "t".
++* [modified] by Jocelyn Merand to accept UTF-8 multi-character symbols, for use in OpenOffice.
++*/
+ static void createngramtable( table_t *t, const char *buf )
+ {
+ char n[MAXNGRAMSIZE+1];
+ const char *p = buf;
+ int i;
++ int pointer = 0;
+
+ /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/
+- for (;;p++) {
++ while(1) {
+
+- const char *q = p;
++ const char *q = &p[pointer]; /*[modified] previously p++ above (for(;;p++)) now, it's pointer wich is increased so we have to get the new pointer on the buffer*/
+ char *m = n;
+
+ /*** First char may be an underscore ***/
+- *m++ = *q++;
++ int decay = charcopy(q, m); /*[modified] previously *q++ = *m++*/
++ q = &(p[pointer+decay]); /*[modified] the old copying method do not manage multi-character symbols*/
++ m += decay; /*[modified]*/
+ *m = '\0';
+
+ increasefreq( t, n, 1 );
+@@ -482,19 +479,22 @@
+ }
+
+ /*** Let the compiler unroll this ***/
+- for ( i=2; i<=MAXNGRAMSIZE; i++) {
++ for ( i=2; i<=MAXNGRAMSYMBOL; i++) {
+
+- *m++ = *q;
++ decay = charcopy(q, m); /*[modified] like above*/
++ m += decay;
+ *m = '\0';
+
+ increasefreq( t, n, i );
+
+ if ( *q == '_' ) break;
+- q++;
++ q += decay;
+ if ( *q == '\0' ) {
+ return;
+ }
+ }
++
++ pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point on the next start of symbol, but whith utf next start is not surely next char*/
+ }
+ return;
+ }
Property changes on: trunk/textproc/libtextcat/files/patch-src_fingerprint.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/textproc/libtextcat/files/patch-src_fingerprint.h
===================================================================
--- trunk/textproc/libtextcat/files/patch-src_fingerprint.h (rev 0)
+++ trunk/textproc/libtextcat/files/patch-src_fingerprint.h 2015-10-10 20:16:47 UTC (rev 20508)
@@ -0,0 +1,16 @@
+--- ./src/fingerprint.h.orig 2003-05-19 14:16:31.000000000 +0200
++++ ./src/fingerprint.h 2010-12-21 16:18:55.000000000 +0100
+@@ -41,7 +41,13 @@
+ extern int fp_Read( void *handle, const char *fname, int maxngrams );
+ extern sint4 fp_Compare( void *cat, void *unknown, int cutoff );
+ extern void fp_Show( void *handle );
++#ifdef __cplusplus
++extern "C" {
++#endif
+ extern const char *fp_Name( void *handle );
++#ifdef __cplusplus
++}
++#endif
+ extern void fp_Print( void *handle, FILE *fp );
+
+ #endif
Property changes on: trunk/textproc/libtextcat/files/patch-src_fingerprint.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/textproc/libtextcat/files/patch-src_textcat.c
===================================================================
--- trunk/textproc/libtextcat/files/patch-src_textcat.c (rev 0)
+++ trunk/textproc/libtextcat/files/patch-src_textcat.c 2015-10-10 20:16:47 UTC (rev 20508)
@@ -0,0 +1,97 @@
+--- src/textcat.c.orig Thu May 22 13:32:43 2003
++++ src/textcat.c Thu Aug 23 22:47:07 2007
+@@ -74,6 +74,7 @@
+ typedef struct {
+
+ void **fprint;
++ char *fprint_disable;
+ uint4 size;
+ uint4 maxsize;
+
+@@ -112,11 +113,21 @@
+ fp_Done( h->fprint[i] );
+ }
+ wg_free( h->fprint );
++ wg_free( h->fprint_disable );
+ wg_free( h );
+
+ }
+
+-extern void *textcat_Init( const char *conffile )
++/** Replaces older function */
++extern void *textcat_Init( const char *conffile ){
++ return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH );
++}
++
++/**
++ * Originally this function had only one parameter (conffile); the prefix parameter was added for use in OOo.
++ * Basically, prefix is the directory path where the fingerprints are stored.
++ */
++extern void *special_textcat_Init( const char *conffile, const char *prefix )
+ {
+ textcat_t *h;
+ char line[1024];
+@@ -134,11 +145,13 @@
+ h->size = 0;
+ h->maxsize = 16;
+ h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );
++ h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize ); /*added to store the state of languages*/
+
+ while ( wg_getline( line, 1024, fp ) ) {
+ char *p;
+ char *segment[4];
+- int res;
++ char finger_print_file_name[512];
++ int res;
+
+ /*** Skip comments ***/
+ #ifdef HAVE_STRCHR
+@@ -156,17 +169,23 @@
+ /*** Ensure enough space ***/
+ if ( h->size == h->maxsize ) {
+ h->maxsize *= 2;
+- h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
++ h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
++ h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize );
+ }
+
+ /*** Load data ***/
+ if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) {
+ goto ERROR;
+ }
+- if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) {
++ finger_print_file_name[0] = '\0';
++ strcat(finger_print_file_name, prefix);
++ strcat(finger_print_file_name, segment[0]);
++
++ if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) {
+ textcat_Done(h);
+ goto ERROR;
+- }
++ }
++ h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/
+ h->size++;
+ }
+
+@@ -203,11 +222,18 @@
+ result = _TEXTCAT_RESULT_SHORT;
+ goto READY;
+ }
+-
++
+ /*** Calculate the score for each category. ***/
+ for (i=0; i<h->size; i++) {
+- int score = fp_Compare( h->fprint[i], unknown, threshold );
+- candidates[i].score = score;
++ int score;
++ if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/
++ score = MAXSCORE;
++ }
++ else{
++ score = fp_Compare( h->fprint[i], unknown, threshold );
++ /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/
++ }
++ candidates[i].score = score;
+ candidates[i].name = fp_Name( h->fprint[i] );
+ if ( score < minscore ) {
+ minscore = score;
Property changes on: trunk/textproc/libtextcat/files/patch-src_textcat.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/textproc/libtextcat/files/patch-src_textcat.h
===================================================================
--- trunk/textproc/libtextcat/files/patch-src_textcat.h (rev 0)
+++ trunk/textproc/libtextcat/files/patch-src_textcat.h 2015-10-10 20:16:47 UTC (rev 20508)
@@ -0,0 +1,41 @@
+--- ./src/textcat.h.orig 2003-05-19 14:16:31.000000000 +0200
++++ ./src/textcat.h 2010-12-21 16:18:55.000000000 +0100
+@@ -39,6 +39,9 @@
+
+ #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN"
+ #define _TEXTCAT_RESULT_SHORT "SHORT"
++#ifdef __cplusplus
++extern "C" {
++#endif
+
+
+ /**
+@@ -51,10 +54,19 @@
+ * Returns: handle on success, NULL on error. (At the moment, the
+ * only way errors can occur, is when the library cannot read the
+ * conffile, or one of the fingerprint files listed in it.)
++ *
++ * Replaces the older function (and has exactly the same behaviour);
++ * see special_textcat_Init() below.
+ */
+ extern void *textcat_Init( const char *conffile );
+
+ /**
++ * Originally this function had only one parameter (conffile); it was extended because OOo must be able to load an alternative DB.
++ * Basically, prefix is the directory path where the fingerprints are stored.
++ */
++extern void *special_textcat_Init( const char *conffile, const char *prefix );
++
++/**
+ * textcat_Done() - Free up resources for handle
+ */
+ extern void textcat_Done( void *handle );
+@@ -77,4 +89,8 @@
+ * textcat_Version() - Returns a string describing the version of this classifier.
+ */
+ extern char *textcat_Version();
++
++#ifdef __cplusplus
++}
++#endif
+ #endif
Property changes on: trunk/textproc/libtextcat/files/patch-src_textcat.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/textproc/libtextcat/files/patch-src_utf8misc.c
===================================================================
--- trunk/textproc/libtextcat/files/patch-src_utf8misc.c (rev 0)
+++ trunk/textproc/libtextcat/files/patch-src_utf8misc.c 2015-10-10 20:16:47 UTC (rev 20508)
@@ -0,0 +1,135 @@
+--- /dev/null Thu Aug 23 22:58:13 2007
++++ src/utf8misc.c Thu Aug 23 22:47:07 2007
+@@ -0,0 +1,132 @@
++/***************************************************************************
++ * Copyright (C) 2006 by Jocelyn Merand *
++ * joc.mer at gmail.com *
++ * *
++ * THE BSD LICENSE
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the
++ * distribution.
++ *
++ * - Neither the name of the WiseGuys Internet B.V. nor the names of
++ * its contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ ***************************************************************************/
++
++#ifndef _UTF8_MISC_H_
++#include "utf8misc.h"
++#endif
++
++/* Return the index just past the symbol that starts at str[position]; the index is returned unchanged when str[position] is the terminating NUL. */
++int nextcharstart(const char *str, int position){
++ int pointer = position;
++
++ if(str[pointer] & ESCAPE_MASK){ /* the top bit of the current byte is set */
++
++ /* so str[pointer] is treated as the lead ("escape") byte of a multi-byte symbol */
++
++ char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /* keep only the high nibble, shifted left once: each remaining leading 1 bit stands for one byte to step over */
++
++ while(escape_char & ESCAPE_MASK && str[pointer]){/* shift left by one bit per step; stop when the top bit is 0 or the string ends */
++ escape_char = escape_char <<1;
++ ++pointer;
++ }
++ }
++ if(str[pointer]){ /* finally, unless we are on the \0 terminator, step over the last byte of the symbol */
++ ++pointer;
++ }
++ return pointer;
++}
++
++/* Copy the bytes of the symbol starting at str[0] into dest and return how many bytes were copied (0 when str[0] is NUL); dest is not NUL-terminated here. */
++int charcopy(const char *str, char *dest){
++
++ int pointer = 0;
++ if(str[pointer] & ESCAPE_MASK){ /* the top bit of the current byte is set */
++
++ /* so str[pointer] is treated as the lead ("escape") byte of a multi-byte symbol */
++
++ char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /* keep only the high nibble, shifted left once: each remaining leading 1 bit stands for one byte to copy in the loop */
++
++ while(escape_char & ESCAPE_MASK && str[pointer]){ /* shift left by one bit per step; stop when the top bit is 0 or the string ends */
++ dest[pointer] = str[pointer];
++ escape_char = escape_char <<1;
++ ++pointer;
++ }
++ }
++ if(str[pointer]){ /* copy the final (or only) byte of the symbol, unless the string has ended */
++ dest[pointer] = str[pointer];
++ ++pointer;
++ }
++
++ return pointer;
++}
++
++/* Return 1 when the first len symbols of key match lex byte for byte and lex ends right after them; return 0 otherwise. */
++int issame( char *lex, char *key, int len )
++{
++ /*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/
++ int char_counter = 0;
++ int pointer = 0;
++ while(char_counter < len) {
++
++ if(key[pointer] & ESCAPE_MASK){ /* the top bit of the current byte is set */
++
++ /* so key[pointer] is treated as the lead ("escape") byte of a multi-byte symbol */
++
++ char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /* keep only the high nibble, shifted left once, to count the bytes compared in the loop below */
++ /* NOTE(review): the loop below has no NUL check; if both strings end inside a multi-byte symbol it appears to keep comparing past the terminators - confirm callers only pass well-formed input */
++ while(escape_char & ESCAPE_MASK && key[pointer] == lex[pointer] ){
++ escape_char = escape_char <<1;
++ ++pointer;
++ }
++ }
++ ++char_counter; /* count this symbol; its last byte is compared just below */
++ if ( key[pointer] != lex[pointer] ) {
++ return 0;
++ /*printf(" NO\n", lex, key, len);*/
++ }
++ ++pointer;
++ }
++ if ( lex[pointer] != '\0' ) { /* lex is longer than the len symbols just compared */
++ return 0;
++ /*printf(" NO\n");*/
++ }
++
++ /*printf(" YES\n");*/
++
++ return 1;
++}
++
++/* Count the symbols in str by calling nextcharstart() repeatedly until the terminating NUL is reached. */
++extern int utfstrlen(const char* str){
++ int char_counter = 0;
++ int pointer = 0;
++ while(str[pointer]) {
++ pointer = nextcharstart(str, pointer);
++
++ ++char_counter; /* one more symbol has been stepped over */
++ }
++ return char_counter;
++}
++
Property changes on: trunk/textproc/libtextcat/files/patch-src_utf8misc.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/textproc/libtextcat/files/patch-src_utf8misc.h
===================================================================
--- trunk/textproc/libtextcat/files/patch-src_utf8misc.h (rev 0)
+++ trunk/textproc/libtextcat/files/patch-src_utf8misc.h 2015-10-10 20:16:47 UTC (rev 20508)
@@ -0,0 +1,91 @@
+--- /dev/null Thu Aug 23 22:58:13 2007
++++ src/utf8misc.h Thu Aug 23 22:47:08 2007
+@@ -0,0 +1,88 @@
++/***************************************************************************
++ * Copyright (C) 2006 by Jocelyn Merand *
++ * joc.mer at gmail.com *
++ * *
++ * THE BSD LICENSE
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the
++ * distribution.
++ *
++ * - Neither the name of the WiseGuys Internet B.V. nor the names of
++ * its contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ ***************************************************************************/
++
++#ifndef _UTF8_MISC_H_
++#define _UTF8_MISC_H_
++
++/**
++ * These macros are used by the character-processing functions.
++ * They were added to handle UTF-8 symbols, in particular their escape chars.
++ */
++#ifdef _UTF8_
++#define ESCAPE_MASK 0x80
++#define WEIGHT_MASK 0xF0
++#else
++#define ESCAPE_MASK 0xFF
++#define WEIGHT_MASK 0x00
++#endif
++
++
++/*
++ * Used to jump to the start of the next character.
++ * Of course, this is only useful when the encoding is UTF-8.
++ * This function was added by Jocelyn Merand so that libtextcat can be used in OOo.
++ */
++int nextcharstart(const char *str, int position);
++
++
++/* Copies the character at the start of str to dest.
++ * Of course, this is only useful when the encoding is UTF-8 and the symbol is encoded with more than one char.
++ * Returns the number of chars jumped over.
++ * This function was added by Jocelyn Merand so that libtextcat can be used in OOo.
++ */
++int charcopy(const char *str, char *dest);
++
++
++/* Checks whether n-gram lex is a prefix of key and has length len.
++* If _UTF8_ is defined, it uses escape characters and len is not really the length of lex;
++* in this case, len is the number of UTF-8 characters: strlen("€") == 3 but len == 1.
++*/
++int issame( char *lex, char *key, int len );
++
++
++/* Counts the number of characters.
++* If _UTF8_ is defined, it uses escape characters and the result is not really the length of str;
++* in this case, the result is the number of UTF-8 characters: strlen("€") == 3 but utfstrlen("€") == 1.
++*/
++#ifdef __cplusplus
++extern "C" {
++#endif
++extern int utfstrlen(const char* str);
++#ifdef __cplusplus
++}
++#endif
++
++#endif
++
Property changes on: trunk/textproc/libtextcat/files/patch-src_utf8misc.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/textproc/libtextcat/pkg-descr
===================================================================
--- trunk/textproc/libtextcat/pkg-descr (rev 0)
+++ trunk/textproc/libtextcat/pkg-descr 2015-10-10 20:16:47 UTC (rev 20508)
@@ -0,0 +1,17 @@
+Libtextcat is a library with functions that implement the classification
+technique described in Cavnar & Trenkle, "N-Gram-Based Text Categorization" [1].
+It was primarily developed for language guessing, a task on which it is known to
+perform with near-perfect accuracy.
+
+The central idea of the Cavnar & Trenkle technique is to calculate a
+"fingerprint" of a document with an unknown category, and compare this with the
+fingerprints of a number of documents of which the categories are known. The
+categories of the closest matches are output as the classification. A
+fingerprint is a list of the most frequent n-grams occurring in a document,
+ordered by frequency. Fingerprints are compared with a simple out-of-place
+metric.
+
+[1] The document that started it all: William B. Cavnar & John M. Trenkle (1994)
+N-Gram-Based Text Categorization, <http://citeseer.ist.psu.edu/68861.html>.
+
+WWW: http://software.wise-guys.nl/libtextcat/
Property changes on: trunk/textproc/libtextcat/pkg-descr
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/textproc/libtextcat/pkg-plist
===================================================================
--- trunk/textproc/libtextcat/pkg-plist (rev 0)
+++ trunk/textproc/libtextcat/pkg-plist 2015-10-10 20:16:47 UTC (rev 20508)
@@ -0,0 +1,90 @@
+bin/createfp
+include/libtextcat/common.h
+include/libtextcat/config.h
+include/libtextcat/constants.h
+include/libtextcat/fingerprint.h
+include/libtextcat/textcat.h
+include/libtextcat/utf8misc.h
+include/libtextcat/wg_mempool.h
+include/textcat.h
+lib/libtextcat.a
+lib/libtextcat.so
+lib/libtextcat.so.0
+lib/libtextcat.so.0.0.0
+%%DATADIR%%/LM/afrikaans.lm
+%%DATADIR%%/LM/albanian.lm
+%%DATADIR%%/LM/amharic-utf.lm
+%%DATADIR%%/LM/arabic-iso8859_6.lm
+%%DATADIR%%/LM/arabic-windows1256.lm
+%%DATADIR%%/LM/armenian.lm
+%%DATADIR%%/LM/basque.lm
+%%DATADIR%%/LM/belarus-windows1251.lm
+%%DATADIR%%/LM/bosnian.lm
+%%DATADIR%%/LM/breton.lm
+%%DATADIR%%/LM/bulgarian-iso8859_5.lm
+%%DATADIR%%/LM/catalan.lm
+%%DATADIR%%/LM/chinese-big5.lm
+%%DATADIR%%/LM/chinese-gb2312.lm
+%%DATADIR%%/LM/croatian-ascii.lm
+%%DATADIR%%/LM/czech-iso8859_2.lm
+%%DATADIR%%/LM/danish.lm
+%%DATADIR%%/LM/drents.lm
+%%DATADIR%%/LM/dutch.lm
+%%DATADIR%%/LM/english.lm
+%%DATADIR%%/LM/esperanto.lm
+%%DATADIR%%/LM/estonian.lm
+%%DATADIR%%/LM/finnish.lm
+%%DATADIR%%/LM/french.lm
+%%DATADIR%%/LM/frisian.lm
+%%DATADIR%%/LM/georgian.lm
+%%DATADIR%%/LM/german.lm
+%%DATADIR%%/LM/greek-iso8859-7.lm
+%%DATADIR%%/LM/hebrew-iso8859_8.lm
+%%DATADIR%%/LM/hindi.lm
+%%DATADIR%%/LM/hungarian.lm
+%%DATADIR%%/LM/icelandic.lm
+%%DATADIR%%/LM/indonesian.lm
+%%DATADIR%%/LM/irish.lm
+%%DATADIR%%/LM/italian.lm
+%%DATADIR%%/LM/japanese-euc_jp.lm
+%%DATADIR%%/LM/japanese-shift_jis.lm
+%%DATADIR%%/LM/korean.lm
+%%DATADIR%%/LM/latin.lm
+%%DATADIR%%/LM/latvian.lm
+%%DATADIR%%/LM/lithuanian.lm
+%%DATADIR%%/LM/malay.lm
+%%DATADIR%%/LM/manx.lm
+%%DATADIR%%/LM/marathi.lm
+%%DATADIR%%/LM/middle_frisian.lm
+%%DATADIR%%/LM/mingo.lm
+%%DATADIR%%/LM/nepali.lm
+%%DATADIR%%/LM/norwegian.lm
+%%DATADIR%%/LM/persian.lm
+%%DATADIR%%/LM/polish.lm
+%%DATADIR%%/LM/portuguese.lm
+%%DATADIR%%/LM/quechua.lm
+%%DATADIR%%/LM/romanian.lm
+%%DATADIR%%/LM/rumantsch.lm
+%%DATADIR%%/LM/russian-iso8859_5.lm
+%%DATADIR%%/LM/russian-koi8_r.lm
+%%DATADIR%%/LM/russian-windows1251.lm
+%%DATADIR%%/LM/sanskrit.lm
+%%DATADIR%%/LM/scots.lm
+%%DATADIR%%/LM/scots_gaelic.lm
+%%DATADIR%%/LM/serbian-ascii.lm
+%%DATADIR%%/LM/slovak-ascii.lm
+%%DATADIR%%/LM/slovak-windows1250.lm
+%%DATADIR%%/LM/slovenian-ascii.lm
+%%DATADIR%%/LM/slovenian-iso8859_2.lm
+%%DATADIR%%/LM/spanish.lm
+%%DATADIR%%/LM/swahili.lm
+%%DATADIR%%/LM/swedish.lm
+%%DATADIR%%/LM/tagalog.lm
+%%DATADIR%%/LM/tamil.lm
+%%DATADIR%%/LM/thai.lm
+%%DATADIR%%/LM/turkish.lm
+%%DATADIR%%/LM/ukrainian-koi8_r.lm
+%%DATADIR%%/LM/vietnamese.lm
+%%DATADIR%%/LM/welsh.lm
+%%DATADIR%%/LM/yiddish-utf.lm
+%%DATADIR%%/conf.txt
Property changes on: trunk/textproc/libtextcat/pkg-plist
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
More information about the Midnightbsd-cvs
mailing list