Skip to content

Commit

Permalink
fix memory leak
Browse files Browse the repository at this point in the history
use the longest match if the previous match is followed by an ignored character
tweak the default ignored ASCII characters to account for international character variants
  • Loading branch information
umegaya committed May 22, 2017
1 parent 4d88d88 commit b26efec
Show file tree
Hide file tree
Showing 9 changed files with 104 additions and 129 deletions.
145 changes: 51 additions & 94 deletions src/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ class IMempool {
virtual void *malloc(size_t sz) = 0;
virtual void free(void *p) = 0;
virtual void *realloc(void *p, size_t sz) = 0;
virtual bool operator == (const IMempool &m) { return true; }
virtual bool operator == (const IMempool &m) { return true; }
virtual bool operator != (const IMempool &m) { return false; }
};
class SystemMempool : public IMempool {
void *malloc(size_t sz) {
Expand All @@ -26,110 +27,66 @@ class SystemMempool : public IMempool {
};

//generate STL allocator from instance of M, which provides
//alloc/free/realloc. heavily based on https://blogs.msdn.microsoft.com/vcblog/2008/08/28/the-mallocator/
template <typename T>
class allocator {
public:
// common part for almost all allocator.
typedef T * pointer;
typedef const T * const_pointer;
typedef T& reference;
typedef const T& const_reference;
typedef T value_type;
typedef size_t size_type;
typedef ptrdiff_t difference_type;

T * address(T& r) const {
return &r;
//alloc/free/realloc. heavily based on http://stackoverflow.com/questions/36517825/is-stephen-lavavejs-mallocator-the-same-in-c11/36521845#36521845
template<class T>
struct allocator {
using value_type = T;
using pointer = T*;
using propagate_on_container_copy_assignment = std::true_type;
using propagate_on_container_move_assignment = std::true_type;
using propagate_on_container_swap = std::true_type;

allocator(IMempool *m) : m_(m) {}
allocator()=delete;
allocator(allocator const &m) {
m_ = m.m_;
}
const T * address(const T& s) const {
return &s;
template<class U>
allocator(allocator<U> const &m) noexcept {
m_ = m.pool_p();
}

size_t max_size() const {
// The following has been carefully written to be independent of
// the definition of size_t and to avoid signed/unsigned warnings.
return (static_cast<size_t>(0) - static_cast<size_t>(1)) / sizeof(T);
allocator& operator=(allocator const &m) {
m_ = m.m_;
return *this;
}

template <typename U> struct rebind {
typedef allocator<U> other;
};

bool operator!=(const allocator& other) const {
return !(*this == other);
template<class U>
allocator& operator=(allocator<U> const &m) noexcept {
m_ = m.pool_p();
return *this;
}

void construct(T * const p, const T& t) const {
void * const pv = static_cast<void *>(p);
new (pv) T(t);

// Allocates uninitialized storage for n objects of type T from the pool m_.
// Throws std::bad_array_new_length when n * sizeof(T) would overflow
// std::size_t, returns nullptr when n is 0, and throws std::bad_alloc when
// the pool's malloc() returns a null pointer.
pointer allocate(std::size_t n) {
if (std::size_t(-1) / sizeof(T) < n)
throw std::bad_array_new_length(); // or something else
if (!n) return nullptr; // zero means null, not throw
if(auto*r= static_cast<pointer>(m_->malloc(n * sizeof(T))))
return r;
throw std::bad_alloc();
}

void destroy(T * const p) const; // Defined below.

// Returns true if and only if storage allocated from *this
// can be deallocated from other, and vice versa.
// Always returns true for stateless allocators.
bool operator==(const allocator& other) const {
return m_->operator==(*other.m_);
void deallocate(pointer p, std::size_t n) {
m_->free(p);
}

// Default constructor, copy constructor, rebinding constructor, and destructor.
// Empty for stateless allocators.
allocator(IMempool *m = nullptr) : m_(m == nullptr ? new SystemMempool() : m) {}
allocator(const allocator &a) : m_(&(a.pool())) {}
template <typename U> allocator(const allocator<U> &a) : m_(&(a.pool())) {}
~allocator() {}

// accessor of underlaying mempool
inline IMempool &pool() const { return *m_; }

// The following will be different for each allocator.
T * allocate(const size_t n) const {
// The return value of allocate(0) is unspecified.
// this module returns NULL in order to avoid depending
// on malloc(0)’s implementation-defined behavior
// (the implementation can define malloc(0) to return NULL,
// in which case the bad_alloc check below would fire).
// All allocators can return NULL in this case.
if (n == 0) {
return nullptr;
}

// All allocators should contain an integer overflow check.
// The Standardization Committee recommends that std::length_error
// be thrown in the case of integer overflow.
if (n > max_size()) {
throw std::overflow_error("allocator<T>::allocate() – Integer overflow.");
}

void * const pv = m_->malloc(n * sizeof(T));

// Allocators should throw std::bad_alloc in the case of memory allocation failure.
if (pv == nullptr) {
throw std::bad_alloc();
}
return static_cast<T *>(pv);
// Equality across (possibly rebound) allocators: two allocators compare equal
// when this allocator's pool reports equality with rhs's pool through
// IMempool::operator==.
// rhs's pool is read through the public pool_p() accessor: m_ is a protected
// member, and allocator<U> is a different class from allocator<T> whenever
// U != T, so touching rhs.m_ directly does not compile for such instantiations.
template<class U>
bool operator==(allocator<U> const& rhs) const {
    return m_->operator==(*rhs.pool_p());
}

void deallocate(T * const p, const size_t n) const {
// allocator wraps free().
m_->free(p);
// Inequality across (possibly rebound) allocators, delegated to
// IMempool::operator!= on the two underlying pools.
// rhs's pool is read through the public pool_p() accessor: m_ is a protected
// member, and allocator<U> is a different class from allocator<T> whenever
// U != T, so touching rhs.m_ directly does not compile for such instantiations.
template<class U>
bool operator!=(allocator<U> const& rhs) const {
    return m_->operator!=(*rhs.pool_p());
}

// The following will be the same for all allocators that ignore hints.
template <typename U> T * allocate(const size_t n, const U * /* const hint */) const {
//TODO: realloc can be used?
public:
// Releases p by casting it to pointer and handing it to deallocate().
// The sizeof(T) passed as the count is a placeholder: deallocate() only
// forwards the pointer to the pool's free() and does not use the count.
void free(void *p) {
deallocate((pointer)p, sizeof(T));
}
// Allocates through allocate(n) and returns the result as void*.
// NOTE(review): allocate() requests n * sizeof(T) bytes from the pool, so n
// here is an element count rather than a byte count despite the malloc-like
// name — confirm that callers expect this.
void *malloc(size_t n) {
return allocate(n);
}

private: //prohibit copy
allocator& operator=(const allocator&);
void *realloc(void *p, size_t n) {
return m_->realloc(p, n);
}
IMempool *pool_p() const { return (IMempool *)m_; }
protected:
IMempool *m_;
};

template <typename T> void allocator<T>::destroy(T * const p) const {
p->~T();
}
}

5 changes: 3 additions & 2 deletions src/checker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Checker::~Checker() {
}
}
void Checker::add(const char *s, void *ctx) {
int sz = strnlen(s, MAX_FILTER_STRING);
int sz = (int)strnlen(s, MAX_FILTER_STRING);
if (sz <= 0) { return; }
u8 buf[sz * utf8::MAX_BYTE_PER_GRYPH];
int rlen = checker_->normalize(reinterpret_cast<const u8*>(s), sz, buf, sz * utf8::MAX_BYTE_PER_GRYPH);
Expand All @@ -29,6 +29,7 @@ void Checker::add_alias(const char *target, const char *alias) {
// Forwards the given glyph string unchanged to checker_'s ignore_glyphs().
void Checker::ignore_glyphs(const char *glyphs) {
checker_->ignore_glyphs(glyphs);
}
// Context-checker callback that accepts unconditionally: every argument is
// ignored and the result is always true. checker.h uses it as the default
// `checker` argument of filter() and should_filter().
bool Checker::truer(const char *in, int ilen, int start, int count, void *ctx) { return true; }
int Checker::masking(const u8 *in, int ilen, u8 *out, int olen, const char *mask, int mlen) {
int iofs = 0, oofs = 0;
while (ilen > iofs && olen > oofs) {
Expand All @@ -55,7 +56,7 @@ int Checker::masking(const u8 *in, int ilen, u8 *out, int olen, const char *mask
const char *Checker::filter(const char *in, int ilen, char *out, int *olen, const char *mask, ContextChecker checker) {
if (mask == nullptr) { mask = "?"; }
int iofs = 0;
int msz = strnlen(mask, MAX_FILTER_STRING);
int msz = (int)strnlen(mask, MAX_FILTER_STRING);
int oofs = 0, tmp;
void *ctx;
const u8 *iptr = reinterpret_cast<const u8 *>(in);
Expand Down
5 changes: 4 additions & 1 deletion src/checker.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class Checker {
void ignore_glyphs(const char *glyphs);
inline void add_word(const char *s, void *ctx) { trie_.add(s, ctx); }
inline void remove(const char *s) { trie_.remove(s); }
static bool truer(const char *in, int ilen, int start, int count, void *ctx) { return true; }
static bool truer(const char *in, int ilen, int start, int count, void *ctx);
const char *filter(const char *in, int ilen, char *out, int *olen, const char *mask = nullptr, ContextChecker checker = truer);
bool should_filter(const char *in, int ilen, int *start, int *count, void **pctx = nullptr, ContextChecker checker = truer);
public:
Expand All @@ -68,5 +68,8 @@ class Checker {
static inline void *operator new(std::size_t, void *buf) { return buf; }
static inline void operator delete(void *p, void *buf) {}
static inline void operator delete( void *p ) { std::free(p); }
#if defined(DEBUG)
void dump() { trie_.dump(); }
#endif
};
}
32 changes: 21 additions & 11 deletions src/language.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ int WordChecker::match(const u8 *in, int ilen, const u8 *pattern, int plen, int
int prtmp = utf8::peek(pattern + n_pread, plen - n_pread);
std::memcpy(pout, pattern + n_pread, prtmp);
pout[prtmp] = 0;
auto al = alias_list(reinterpret_cast<const char *>(pout));
const auto &al = alias_list(reinterpret_cast<const char *>(pout));
//TRACE("check with alias: pick %s %s %lu\n", out, pout, al.size());
if (al.size() > 0) {
for (auto &a : al) {
Expand All @@ -69,6 +69,15 @@ int WordChecker::match(const u8 *in, int ilen, const u8 *pattern, int plen, int
break;
}
}
// To obtain the longest match, also consume any ignored characters that immediately follow.
while (n_read < ilen) {
int wtmp = utf8::MAX_BYTE_PER_GRYPH;
int rtmp = read_next_with_normalize(in + n_read, ilen - n_read, out, &wtmp);
if (rtmp == 0 || wtmp > 0) { //文字列の終端か、無視されない正規化文字.
break;
}
n_read += rtmp;
}
*ofs = n_read;
return n_pread;
}
Expand Down Expand Up @@ -97,36 +106,37 @@ int WordChecker::normalize(const u8 *in, int ilen, u8 *out, int olen) {
std::placeholders::_1, std::placeholders::_2,
std::placeholders::_3, std::placeholders::_4));
}
void WordChecker::set_alias(const char *pattern, strvec &vec) {
auto i = aliases_map_.find(pattern);
void WordChecker::set_alias(const char *pattern, strvec &vec) {
str s(pattern, pool());
const auto &i = aliases_map_.find(s);
if (i == aliases_map_.end()) {
vec.push_back(pattern);
aliases_map_.emplace(pattern, std::move(vec));
vec.push_back(s);
aliases_map_.emplace(s, vec);
} else {
strvec &v = (*i).second;
std::copy(vec.begin(), vec.end(), std::back_inserter(v));
}
}
void WordChecker::link_alias(const char *pattern1, const char *pattern2) {
if (std::strcmp(pattern1, pattern2) != 0) {
strvec v1(pstr_alloc_); v1.push_back(pattern1);
strvec v2(pstr_alloc_); v2.push_back(pattern2);
strvec v1(pstr_alloc_); v1.push_back(str(pattern1, pool()));
strvec v2(pstr_alloc_); v2.push_back(str(pattern2, pool()));
set_alias(pattern1, v2);
set_alias(pattern2, v1);
}
}
const WordChecker::strvec &WordChecker::alias_list(const char *key) const {
auto i = aliases_map_.find(key);
const auto &i = aliases_map_.find(str(key, pool()));
if (i == aliases_map_.end()) {
static strvec empty_list;
return empty_list;
return empty_list_;
} else {
return (*i).second;
}
}
// Add an alias.
void WordChecker::add_alias(const char *target, const char *alias) {
strvec v{alias};
strvec v(pool());
v.push_back(str(alias, pool()));
set_alias(target, v);
}
// Add a string of characters to be ignored.
Expand Down
10 changes: 6 additions & 4 deletions src/language.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ namespace language {
class WordChecker : public IMatcher {
public:
typedef std::function<int(const u8 *in, int ilen, u8 *out, int *olen)> normalizer;
typedef std::basic_string<char, std::char_traits<char>, allocator<char>> str;
typedef std::basic_string<char, std::char_traits<char>, allocator<char>> str;
typedef allocator<str> stralloc;
typedef allocator<normalizer> normalloc;
typedef std::vector<str, stralloc> strvec;
Expand All @@ -25,10 +25,12 @@ class WordChecker : public IMatcher {
normalloc norm_alloc_;
svmap aliases_map_;
normvec normalizers_;
strvec empty_list_;
char *ignore_glyphs_;
public:
WordChecker(IMempool *p) : pstr_alloc_(p), pair_alloc_(p), norm_alloc_(p),
aliases_map_(pair_alloc_), normalizers_(norm_alloc_), ignore_glyphs_(nullptr) {
WordChecker(IMempool *p) : pstr_alloc_(p), pair_alloc_(p), norm_alloc_(p),
aliases_map_(pair_alloc_), normalizers_(norm_alloc_), empty_list_(pstr_alloc_),
ignore_glyphs_(nullptr) {
normalizers_.push_back(std::bind(&WordChecker::remove_ignored, this,
std::placeholders::_1, std::placeholders::_2,
std::placeholders::_3, std::placeholders::_4));
Expand All @@ -48,7 +50,7 @@ class WordChecker : public IMatcher {
void set_alias(const char *pattern, strvec &vec);
void link_alias(const char *pattern1, const char *pattern2);
const strvec &alias_list(const char *key) const;
inline IMempool &pool() { return pstr_alloc_.pool(); }
inline stralloc &pool() const { return (stralloc &)pstr_alloc_; }
inline svmap &aliases_map() { return aliases_map_; }
protected:
//internals
Expand Down
10 changes: 5 additions & 5 deletions src/language/jp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

#include "util.h"
#include "checker.h"
#include "language/jp.h"
#include "jp.h"

namespace shutup {
namespace language {
// Characters that are ignored during checking.
static const char *ignore_list =
"-+!\"#$%%&()*/,:;<=>?@[\\]^_{|}~ "
".-\"#$%%&()*/,:;<=>?@[\\]^_{|}~ "
"" //half kata hyphen
"、。,.・:;?!゛゜´`¨^ ̄_ヽヾゝゞ〃仝々〆〇ー‐/\~∥|…‥"
"‘’“”()〔〕[]{}〈〉《》「」『』【】+-±×÷=≠<>≦≧∞∴"
Expand All @@ -25,8 +25,8 @@ int JP::init() {
size_t hlen = std::strlen(utf8::jp::hiras), klen = std::strlen(utf8::jp::katas);
int hidx = 0, kidx = 0;
while (kidx < klen && hidx < hlen) {
int htmp = utf8::peek(reinterpret_cast<const u8 *>(utf8::jp::hiras + hidx), hlen - hidx);
int ktmp = utf8::peek(reinterpret_cast<const u8 *>(utf8::jp::katas + kidx), klen - kidx);
int htmp = utf8::peek(reinterpret_cast<const u8 *>(utf8::jp::hiras + hidx), (int)hlen - hidx);
int ktmp = utf8::peek(reinterpret_cast<const u8 *>(utf8::jp::katas + kidx), (int)klen - kidx);
if (htmp == 0 || ktmp == 0) {
break;
}
Expand All @@ -48,7 +48,7 @@ int JP::init() {
void JP::add_synonym(const char *pattern, Checker &c, void *ctx) {
if (utf8::jp::is_kana_string(pattern)) {
// Register the romaji conversions, starting with the Hepburn style.
int ilen = std::strlen(pattern), olen = ilen;
int ilen = (int)std::strlen(pattern), olen = ilen;
const u8 *in = reinterpret_cast<const u8*>(pattern);
u8 out[ilen];
int r = util::convert(in, ilen, out, olen, std::bind(&utf8::jp::to_hebon_roman,
Expand Down
10 changes: 5 additions & 5 deletions src/language/utiljp.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,9 +200,9 @@ const char *utf8::jp::japan_consonants[] =

bool utf8::jp::is_kana_string(const char *str) {
size_t len = std::strlen(str);
int idx = 0;
size_t idx = 0;
while (len > idx) {
int tmp = peek(reinterpret_cast<const u8 *>(str + idx), len - idx);
int tmp = peek(reinterpret_cast<const u8 *>(str + idx), (int)(len - idx));
if (tmp == 0) {
break;
}
Expand All @@ -227,13 +227,13 @@ static inline int kana_index(const u8 *in, int ilen, int *olen) {
const char *p = std::strstr(utf8::jp::katas, reinterpret_cast<const char *>(buff));
if (p != nullptr) {
*olen = r;
return (p - utf8::jp::katas) / 3;
return (int)(p - utf8::jp::katas) / 3;
} else {
p = std::strstr(utf8::jp::hiras, reinterpret_cast<const char *>(buff));
if (p != nullptr) {
//TRACE("kana_index: match hira: %s %lu\n", p, p - utf8::jp::hiras);
*olen = r;
return (p - utf8::jp::hiras) / 3;
return (int)(p - utf8::jp::hiras) / 3;
}
}
//TRACE("kana_index error no match: %s\n", in);
Expand Down Expand Up @@ -357,7 +357,7 @@ int utf8::jp::to_roman(const u8 *in, int ilen, u8 *out, int *olen, bool assimila
// No more buffer left to read from.
}
const char *c = consonant_exception(consonants[consonant_index], vowel_index, consonant_index, assimilated);
int clen = std::strlen(c);
int clen = (int)std::strlen(c);
int total_len = 1 + clen;
//TRACE("store last result %d %d %s %d %d %d\n", vowel_index, consonant_index, c, *olen, clen, total_len);
if (*olen < total_len) {
Expand Down
Loading

0 comments on commit b26efec

Please sign in to comment.