What's new?
|
Help
|
Directory
|
Sign in
lingr
Lingr API
Project Home
Downloads
Wiki
Issues
Source
Checkout
|
Browse
|
Changes
|
Source Path:
svn
/
trunk
/
plugins
/
multilingual_ferret_tools
/
lib
/
multilingual_chunker.rb
r87
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
module MultilingualFerretTools
  # Walks a UTF-8 string and hands back consecutive "chunks": runs of
  # codepoints that share one classification (:latin, :non_latin,
  # :latin_whitespace or :non_latin_whitespace).
  #
  # Usage:
  #   chunker = Chunker.new("hello world")
  #   while (chunk = chunker.next)
  #     text, classification, first_offset, last_offset = chunk
  #   end
  class Chunker
    # Codepoints treated as Latin whitespace (space, tab).
    LATIN_WHITESPACE = [0x20, 0x09].freeze
    # Codepoints treated as non-Latin whitespace (U+3000 IDEOGRAPHIC SPACE).
    NON_LATIN_WHITESPACE = [0x3000].freeze
    # Whitespace codepoints grouped by script.
    WHITESPACE = { :latin => LATIN_WHITESPACE, :non_latin => NON_LATIN_WHITESPACE }.freeze
    # Maps each whitespace classification to the script it may be merged with
    # when the :combine whitespace option is in effect.
    SCRIPT_FOR_WHITESPACE = {
      :latin_whitespace => :latin,
      :non_latin_whitespace => :non_latin
    }.freeze

    # str     - UTF-8 encoded String to split into chunks.
    # options - Hash of settings. The only key read by this class is
    #           :whitespace:
    #             :discard (default) - whitespace-only chunks are skipped.
    #             :combine           - whitespace is merged into an adjacent
    #                                  run of the matching script.
    #             any other value    - whitespace chunks are returned as-is.
    #
    # The caller's hash is copied, never modified.
    def initialize(str, options = {})
      @codepoints = str.unpack("U*")
      @u_index = 0 # position in @codepoints
      @s_index = 0 # byte position in the original string
      # Plain-Ruby replacement for ActiveSupport's reverse_merge!: apply the
      # default only when the key is absent, on a private copy.
      @options = options.dup
      @options[:whitespace] = :discard unless @options.key?(:whitespace)
    end

    # Returns the next chunk as
    #   [text, classification, first_byte_offset, last_byte_offset]
    # or nil once the input is exhausted. Offsets are inclusive.
    def next
      # Loop (rather than recurse) past whitespace chunks that must be
      # discarded, so long whitespace sequences cannot deepen the stack.
      loop do
        return nil if @u_index >= @codepoints.length
        chunk = read_chunk
        return chunk unless @options[:whitespace] == :discard && whitespace?(chunk[1])
      end
    end

    # Classifies either a single codepoint or a whole string.
    #
    # thing - Integer codepoint, or a UTF-8 String.
    #
    # For an Integer returns :latin_whitespace, :non_latin_whitespace,
    # :latin (codepoint below 0x300) or :non_latin.
    #
    # For a String returns the single script found in it (:latin or
    # :non_latin, ignoring any whitespace), the single whitespace kind if the
    # string is whitespace only, and :mixed otherwise (both scripts, both
    # whitespace kinds with no script, or an empty string).
    #
    # Returns :unknown for any other kind of argument.
    def self.classify(thing)
      if thing.is_a?(String)
        seen = thing.unpack("U*").map { |codepoint| classify(codepoint) }.uniq
        scripts = seen - SCRIPT_FOR_WHITESPACE.keys
        # With no script present, fall back to the whitespace kinds seen.
        candidates = scripts.empty? ? seen : scripts
        candidates.length == 1 ? candidates.first : :mixed
      elsif thing.is_a?(Integer)
        if LATIN_WHITESPACE.include?(thing)
          :latin_whitespace
        elsif NON_LATIN_WHITESPACE.include?(thing)
          :non_latin_whitespace
        elsif thing < 0x300
          :latin
        else
          :non_latin
        end
      else
        :unknown
      end
    end

    private

    # Consumes one run of codepoints starting at @u_index and returns it as a
    # chunk array. Must only be called while input remains.
    def read_chunk
      start_u_index = @u_index
      start_s_index = @s_index
      classification = Chunker.classify(@codepoints[@u_index])
      combining = @options[:whitespace] == :combine

      while @u_index < @codepoints.length - 1
        upcoming = Chunker.classify(@codepoints[@u_index + 1])
        same_run = upcoming == classification ||
                   (combining && combinable?(upcoming, classification))
        break unless same_run
        # A run that began with whitespace takes on the script of the text
        # that follows it; absorbed whitespace never changes the run's class.
        classification = upcoming unless whitespace?(upcoming)
        @u_index += 1
      end
      @u_index += 1

      text = @codepoints[start_u_index...@u_index].pack("U*")
      # bytesize, not length: String#length counted bytes on Ruby 1.8 but
      # counts characters on 1.9+, which would make this offset identical to
      # the codepoint index for multibyte text.
      @s_index += text.bytesize
      [text, classification, start_s_index, @s_index - 1]
    end

    # True when the classification is one of the two whitespace kinds.
    def whitespace?(classification)
      SCRIPT_FOR_WHITESPACE.key?(classification)
    end

    # True when one classification is a whitespace kind and the other is the
    # script that whitespace belongs to (in either order).
    def combinable?(first, second)
      SCRIPT_FOR_WHITESPACE[first] == second || SCRIPT_FOR_WHITESPACE[second] == first
    end
  end
end
Show details
Hide details
Change log
r76
by dburkes on Oct 06, 2008
Diff
move to Google Code repository
Go to:
/trunk/api
/trunk/api/toolkits
/trunk/api/toolkits/javascript
...pi/toolkits/javascript/infoteria
...ts/javascript/infoteria/lingr.js
...pt/infoteria/lingr.js/README.txt
...ript/infoteria/lingr.js/lingr.js
...nfoteria/lingr.js/lingrtest.html
.../infoteria/lingr.js/prototype.js
...kits/javascript/infoteria/simple
...vascript/infoteria/simple/README
...t/infoteria/simple/api_demo.html
/trunk/api/toolkits/ruby
/trunk/api/toolkits/ruby/infoteria
...i/toolkits/ruby/infoteria/README
...its/ruby/infoteria/api_client.rb
...oolkits/ruby/infoteria/botkit.rb
.../ruby/infoteria/botkit_sample.rb
.../infoteria/sample_chat_client.rb
/trunk/plugins
...lugins/multilingual_ferret_tools
...multilingual_ferret_tools/README
...ltilingual_ferret_tools/Rakefile
...ultilingual_ferret_tools/init.rb
...ns/multilingual_ferret_tools/lib
...ols/lib/multilingual_analyzer.rb
...ools/lib/multilingual_chunker.rb
...ools/lib/multilingual_version.rb
...s/multilingual_ferret_tools/test
...st/multilingual_analyzer_test.rb
...est/multilingual_chunker_test.rb
/trunk/plugins/versioned_urls
...lugins/versioned_urls/README.txt
...k/plugins/versioned_urls/init.rb
/trunk/plugins/versioned_urls/lib
...ioned_urls/lib/versioned_urls.rb
/trunk/plugins/versioned_urls/test
...urls/test/versioned_urls_test.rb
Project members,
sign in
to write a code review
Older revisions
All revisions of this file
File info
Size: 2979 bytes, 65 lines
View raw file