use strict; use warnings; $INC{'Encode/ConfigLocal.pm'}=1; use Encode; eval { local $SIG{__DIE__}; # silence weechat die handler require Encode::HanExtra }; # more chinese eval { local $SIG{__DIE__}; require Encode::JIS2K }; # more japanese use Time::Local; # luanma.pl is written by Nei # and licensed under the under GNU General Public License v3 # or any later version # to read the following docs, you can use "perldoc luanma.pl" =head1 NAME luanma - store more info about encoding of message, and change it with update (weechat edition) =head1 SYNOPSIS more help for charset troubles command is called /lma see "/help lma" for usage =head1 DESCRIPTION luanma will allow you to view received messages as they would appear when decoded using different charsets. you might know this feature from your webbrowser. it is useful if you need to understand a message that was received, but it looks garbled because the sender used different charset than you. as usual, a list of charsets can be defined that will be tried in consecution until a successful decode is made. the charset can be choosen differently for different times in the past and based on nick and weechat buffer. furthermore, you can use /debug tags to see which charset was used to decode a message. a table of charset rules will be saved to luanma.conf and can be edited with /lma set command. =head1 CAVEATS =over =item * the automatic encoding of outgoing notices is visible on display and no assessment of success is given, because weechat does not set the appropriate tag on outgoing notices =item * in order to not convert all messages as raw, only high bit data (extended ascii) is encoded. that means the script works fine for latin variants and utf8, but B for any 7bit-clean encoding or for national EBCDIC =item * colours might get mixed up with colorize_nicks script when there is a nick with the same name as a 2-hex-character encoding (example: "b2"). One possible workaround is to turn off greedy_matching in colorize_nicks =item * no encoding is done when outgoing charset is specified as 'utf8' =back =head1 BUGS =over =item * splitting of messages is not supported, so if byte-length of message in utf8 exceeds 510, it will get split by weechat. result is that only the first part is encoded properly. if byte-length of B message exceeds 510, then it will usually get cut off (exact behaviour depends on IRC server) =item * the prefix on ACTION messages ('/me') gets recoded for messages that you send yourself. this will cause problems with utf8 nicknames where supported =back =head1 SETTINGS the settings are usually found in the plugins.var.perl.luanma namespace, that is, type /set plugins.var.perl.luanma.* to see them and /set plugins.var.perl.luanma.SETTINGNAME VALUE to change a setting C to a new value C. Finally, /unset plugins.var.perl.luanma.SETTINGNAME will reset a setting to its default value. the following settings are available: =head2 tags white-space separated list of irc_(in_) tags to store raw messages of (only those can be recoded). see /debug tags =head2 encode_warn add a warning message into the line displayed on your buffer, when encoding of outgoing messages fails/is lossy =head2 parser parser to use for line parsing. valid options: ondemand, async, full. ondemand will parse lines when displayed on screen (needs parse on every buffer switch, but fast on load). async and full do not need to parse lines when switching buffers, but WILL FREEZE your weechat on /script (re)load and /upgrade. be careful. async uses timers to do the parsing which should make it less likely for you to drop network connection. full will do the parse in one swipe, so it is faster and the freeze is of shorter duration. =cut use constant SCRIPT_NAME => 'luanma'; weechat::register(SCRIPT_NAME, 'Nei ', '0.2', 'GPL3', 'more flexibility with incoming charset', 'stop_luanma', '') || return; sub SCRIPT_FILE() { my $infolistptr = weechat::infolist_get('perl_script', '', SCRIPT_NAME); my $filename = weechat::infolist_string($infolistptr, 'filename') if weechat::infolist_next($infolistptr); weechat::infolist_free($infolistptr); return $filename unless @_; } { package Nlib; # this is a weechat perl library use strict; use warnings; no warnings 'redefine'; ## hdh -- hdata helper ## $_[0] - arg pointer or hdata list name ## $_[1] - hdata name ## $_[2..$#_] - hdata variable name ## $_[-1] - hashref with key/value to update (optional) ## returns value of hdata, and hdata name in list ctx, or number of variables updated sub hdh { if (@_ > 1 && $_[0] !~ /^0x/ && $_[0] !~ /^\d+$/) { my $arg = shift; unshift @_, weechat::hdata_get_list(weechat::hdata_get($_[0]), $arg); } while (@_ > 2) { my ($arg, $name, $var) = splice @_, 0, 3; my $hdata = weechat::hdata_get($name); unless (ref $var eq 'HASH') { $var =~ s/!(.*)/weechat::hdata_get_string($hdata, $1)/e; (my $plain_var = $var) =~ s/^\d+\|//; my $type = weechat::hdata_get_var_type_string($hdata, $plain_var); if ($type eq 'pointer') { my $name = weechat::hdata_get_var_hdata($hdata, $var); unshift @_, $name if $name; } my $fn = "weechat::hdata_$type"; unshift @_, do { no strict 'refs'; &$fn($hdata, $arg, $var) }; } else { return weechat::hdata_update($hdata, $arg, $var); } } wantarray ? @_ : $_[0] } ## hook_dynamic -- weechat::hook something and store hook reference ## $hook_call - hook type (e.g. modifier) ## $what - event type to hook (depends on $hook_call) ## $sub - subroutine name to install ## @params - parameters sub hook_dynamic { my ($hook_call, $what, $sub, @params) = @_; my $caller_package = (caller)[0]; eval qq{ package $caller_package; no strict 'vars'; \$DYNAMIC_HOOKS{\$what}{\$sub} = weechat::hook_$hook_call(\$what, \$sub, \@params) unless exists \$DYNAMIC_HOOKS{\$what} && exists \$DYNAMIC_HOOKS{\$what}{\$sub}; }; die $@ if $@; } ## unhook_dynamic -- weechat::unhook something where hook reference has been stored with hook_dynamic ## $what - event type that was hooked ## $sub - subroutine name that was installed sub unhook_dynamic { my ($what, $sub) = @_; my $caller_package = (caller)[0]; eval qq{ package $caller_package; no strict 'vars'; weechat::unhook(\$DYNAMIC_HOOKS{\$what}{\$sub}) if exists \$DYNAMIC_HOOKS{\$what} && exists \$DYNAMIC_HOOKS{\$what}{\$sub}; delete \$DYNAMIC_HOOKS{\$what}{\$sub}; delete \$DYNAMIC_HOOKS{\$what} unless \%{\$DYNAMIC_HOOKS{\$what}}; }; die $@ if $@; } sub fu8on(@) { Encode::_utf8_on($_) for @_; wantarray ? @_ : shift } use Pod::Select qw(); use Pod::Simple::TextContent; ## get_desc_from_pod -- return setting description from pod documentation ## $file - filename with pod ## $setting - name of setting ## returns description as text sub get_desc_from_pod { my $file = shift; return unless -s $file; my $setting = shift; open my $pod_sel, '>', \my $ss; Pod::Select::podselect({ -output => $pod_sel, -sections => ["SETTINGS/$setting"]}, $file); my $pt = new Pod::Simple::TextContent; $pt->output_string(\my $ss_f); $pt->parse_string_document($ss); my ($res) = $ss_f =~ /^\s*\Q$setting\E\s+(.*)\s*/; $res } ## get_settings_from_pod -- retrieve all settings in settings section of pod ## $file - file with pod ## returns list of all settings sub get_settings_from_pod { my $file = shift; return unless -s $file; open my $pod_sel, '>', \my $ss; Pod::Select::podselect({ -output => $pod_sel, -sections => ["SETTINGS//!.+"]}, $file); $ss =~ /^=head2\s+(.*)\s*$/mg } 1 } use constant CMD_NAME => 'lma'; our @nags; our $nag_tag; our %nag_modifiers; our $CFG_FILE_NAME = weechat::info_get('weechat_dir', '').weechat::info_get('dir_separator', '').SCRIPT_NAME.'.conf'; our (@CFG_TABLE, @CFG_TABLE_2); our @STO = (\(our (%BYTE_MSGS, %ESC_MSG, %MSG_TIME, %MSG_BUF, %MSG_NICK, %MSG_ENC, %MSG_FLT, %MSG_COLOR))); our (@ENCODE_TABLE, @ENCODE_TABLE2); our %DEC; our $GC_COUNT; our $GC_LIMIT = 10_000; our $PARSE_STATS = 987; our $ASYNC_PARSE = $PARSE_STATS; our %ASYNC_BUF; our $ASYNC_TIMER; our @mon = qw(jan feb mar apr may jun jul aug sep oct nov dec); our %mon = do { my $i = 0; map { $_ => $i++ } @mon }; our $mon_re = join '|', @mon; ## esc1 -- escape all endangered characters ## @_ - strings to modify sub esc1 { for (@_) { # need to fix up escape here, weechat kills it # see grep { weechat::string_remove_color(chr $_, "") ne chr $_ } (000..0177) # our escape bracket is 020 s/([^\000-\017\021-\030\035-\175\177])/sprintf "\020%x\020", ord $1/ge; } } ## esc_only -- message needs no recode ## $_[0] - message string to check ## returns bool sub esc_only { $_[0] !~ /[^\000-\032\034-\175\177]/ } init_luanma(); weechat::hook_config('plugins.var.perl.'.SCRIPT_NAME.'.*', 'default_options', ''); weechat::hook_signal('buffer_line_added', 'line_sig', ''); weechat::hook_signal('upgrade', 'restore_lines', ''); weechat::hook_modifier('input_text_for_buffer', 'auto_encode_mod', ''); weechat::hook_command(CMD_NAME, 'a better /charset', (join ' || ', 'list', 'set [] [-out ]', 'set -out ', 'del [-g]', 'del -out [-g]', 'save', 'reload', 'list_rules', 'gc', 'forget -yes', ), (join "\n", 'without arguments, the list of keys is displayed', '', ' list: show list of current recode rules', ' set: adds or modifies a recode rule', ' del: delete one or many recode rules', ' save: save rules to config file', ' reload: reload rules from config file', 'list_rules: list internal rules and pointers (for debug and /debug tags)', ' gc: remove raw lines from cache that are no longer valid in weechat (this is also done autoatically)', " forget: forget everything about messages, forget all raw and all ESC messages. be careful, /@{[CMD_NAME]} cannot be used anymore after youx do this!", '', ' ts: timestamp at which the rule starts to become effective', ' the following time specifications are supported:', ' 1357986420: unix timestamp as output by `date +%s\' (used by weechat internally)', ' -59 | -59m: relative time, 59 seconds/minutes /', ' -23h | -1d: 23 hours / 1 day ago', ' HH:MM:SS : time (hour:minutes:seconds)', ' Jan01 : date (format: MonDD)', ' Jan0100:00:00 = midnight on January 1st.', ' 1: from the beginning on', ' 0: starting from now', ' the "del" command additionally supports these specifiers:', ' *: any time', ' >ts: rule effective after ts', ' length $a } keys %DEC; if ($s =~ s/^\02010\020/\020/) {} elsif ($s =~ s/^\020($codecs)\020//) { $s = $DEC{$1}->encode($s, Encode::FB_DEFAULT); # must make best effort here } elsif ($p =~ /^PRIVMSG/ && $s =~ /^(\01ACTION )\020($codecs)\020(.*)(\01)$/) { # /me $s = "\01ACTION ".$DEC{$2}->encode($3, Encode::FB_DEFAULT)."\01"; # avoid upgrade } else { return $_[3] } "$p$x$s" } ## auto_encode_mod -- add encoding prefixes to buffer input line ## () - modifier handler ## $_[1] - modifier ## $_[3] - content of line before sending sub auto_encode_mod { # XXX should do the splitting my $in = Nlib::fu8on(weechat::string_input_for_buffer($_[3])); return $_[3] unless $in; # pass through commands return $_[3] unless exists $nag_modifiers{privmsg}; return $_[3] if $in =~ /^\020/; # already marked my $buf = Nlib::hdh($_[2], 'buffer', 'name'); my ($r) = grep { $buf =~ $_->{buf_re} && $in =~ $_->{pat_re} } @ENCODE_TABLE2; return $_[3] unless $r; return $_[3] if $r->{_}{charset} eq 'utf8'; # XXX "\020${$r}{_}{charset}\020$in" } ## auto_encode_cmd -- add tag to command for encode marker ## () - command_run handler ## $_[0] - forward to which command ## $_[1] - buffer pointer ## $_[2] - command sub auto_encode_cmd { # XXX should do the splitting Encode::_utf8_on($_[2]); #my @args = split ' ', $_[2]; my ($pre, $in, $buf); if ($_[0] eq 'me' && $_[2] =~ /^(\S+\s)(.*)$/i) { ($pre, $in) = ($1, $2); $buf = weechat::buffer_get_string($_[1], 'name'); } elsif ($_[0] eq 'msg' && $_[2] =~ /^(\S+(?:\s+-server\s+(\S+))?\s+(\S+) )(.*)$/i) { my ($srv, $targ) = ($2, $3); ($pre, $in) = ($1, $4); $srv //= weechat::buffer_get_string($_[1], 'localvar_server'); $buf = $targ ne '*' ? "$srv.$targ" : weechat::buffer_get_string($_[1], 'name'); } elsif ($_[0] eq 'query' && $_[2] =~ /^(\S+(?:\s+-server\s+(\S+))?\s+(\S+) )(\s*\S.*)$/i) { my ($srv, $targ) = ($2, $3); ($pre, $in) = ($1, $4); $srv //= weechat::buffer_get_string($_[1], 'localvar_server'); $buf = "$srv.$targ"; } elsif ($_[0] eq 'wallchops' && $_[2] =~ /^(\S+(?:\s+([#&]\S+))? )(.*)$/i) { my $targ = $2; ($pre, $in) = ($1, $3); $buf = $targ ? weechat::buffer_get_string($_[1], 'localvar_server').'.'.$targ : weechat::buffer_get_string($_[1], 'name'); } elsif ($_[0] eq 'topic' && $_[2] !~ /\s-delete\s*$/i && $_[2] =~ /^(\S+(?:\s+([#&\S+]))? )(.*)$/i) { my $targ = $2; ($pre, $in) = ($1, $3); $buf = $targ ? weechat::buffer_get_string($_[1], 'localvar_server').'.'.$targ : weechat::buffer_get_string($_[1], 'name'); } else { return weechat::WEECHAT_RC_OK } return weechat::WEECHAT_RC_OK if $in =~ /^\020/; # already marked my ($r) = grep { $buf =~ $_->{buf_re} && $in =~ $_->{pat_re} } @ENCODE_TABLE2; return weechat::WEECHAT_RC_OK unless $r; return weechat::WEECHAT_RC_OK if $r->{_}{charset} eq 'utf8'; # XXX weechat::command($_[1], "$pre\020${$r}{_}{charset}\020$in"); return weechat::WEECHAT_RC_OK_EAT } ## find_rule -- find rule to recode this line ## $time - timestamp of line ## $buf - buffer name ## $nick - nick ## returns rule if found or undef sub find_rule { my ($time, $buf, $nick) = @_; my ($r) = grep { $_->{_}{time} <= $time && $buf =~ $_->{buf_re} && $nick =~ $_->{nick_re} } @CFG_TABLE_2; $r } ## apply_recode -- recode a line, looking up its rule first ## $lp - pointer to 'line' hdata sub apply_recode { my $lp = shift; my $rule = find_rule($MSG_TIME{$lp}, $MSG_BUF{$lp}, $MSG_NICK{$lp})//\undef; return if $rule == $MSG_FLT{$lp}; my ($s, $e); for my $enc ((($rule == \undef) ? () : @{$rule->{_}{charsets}}), 'x') { $s = $BYTE_MSGS{$lp}; if ($enc eq 'x') { esc1($s); $e = $enc; last; } else { my $enc2 = $enc; my $partial = $enc2 =~ s/!$//; next if $enc2 eq 'hz' && $s =~ /[^\000-\177]/; # hack for hz # put further hacks here... my $t = $DEC{$enc2}->decode($s, Encode::FB_QUIET); # FB_CROAK not reliable #$t =~ s/[[:cntrl:]]//g; if (length $t && !length $s) { # decoding succeeds $s = $t; $e = $enc2; last; } elsif (length $t && $partial) { esc1($s); $s = $t . '' . $s; $e = $enc2 . '_loss'; last; } } } if ($MSG_ENC{$lp} ne $e) { my @line_data = Nlib::hdh((sprintf '0x%x', $lp), 'line', 'data'); my @tags = grep { !/^lma_/ } map { Nlib::hdh(@line_data, "$_|tags_array") } 0 .. Nlib::hdh(@line_data, 'tags_count')-1; my @ctrl_res = split "\0", $MSG_COLOR{$lp}, -1; my $c = 1; $s =~ s/\01+/$ctrl_res[$c++]/g; Nlib::hdh(@line_data, +{ message => $s, tags_array => (join ',', (($e eq 'x') ? () : ("lma_$e", (sprintf 'lma_0x%x', $rule))), @tags), }); $MSG_ENC{$lp} = $e; } $MSG_FLT{$lp} = $rule; } ## line_sig -- decode charset previously replaced and fix up outgoing msgs ## () - signal handler ## $_[2] - line ptr sub line_sig { my @line_data = Nlib::hdh($_[2], 'line', 'data'); my $lp = oct $_[2]; $ASYNC_BUF{$lp} = undef if $ASYNC_TIMER; # we are still in async reread loop, mark this line as seen return weechat::WEECHAT_RC_OK unless Nlib::hdh(@line_data, 'buffer', 'plugin', 'name') eq 'irc'; my @tags = map { Nlib::hdh(@line_data, "$_|tags_array") } 0 .. Nlib::hdh(@line_data, 'tags_count')-1; return weechat::WEECHAT_RC_OK unless grep /$nag_tag/i, @tags; my $message_c = Nlib::hdh(@line_data, 'message'); return weechat::WEECHAT_RC_OK unless $message_c =~ /\020/; my $message = my $message_nc = weechat::string_remove_color($message_c, "\1"); if (defined $_[0] && grep { $_ eq 'no_highlight' } @tags) { # might be own msg, $_[0] == undef on history parsing my $action_pfx_re = qr//; if (grep { $_ eq 'irc_action' } @tags) { # XXX might erroneously recode the in-line prefix (utf8 nicks anyone? ircx?!) $action_pfx_re = qr/\S+ \K/; } my $codecs = join '|', map { quotemeta } sort { length $b <=> length $a } keys %DEC; if ($message =~ /^\02010\020/) {} # fall through elsif ($message =~ s/^$action_pfx_re\020($codecs)\020//) { my $dec = $1; my @ctrl_res; if ($message_nc =~ /\01/) { my $id_control = quotemeta $message_nc; $id_control =~ s/(\\\01)+/(.+?)/g; @ctrl_res = $message_c =~ /^()$id_control()$/; } $message_nc =~ s/^$action_pfx_re\020\Q$dec\E\020//; Encode::_utf8_on($message); my $s = $DEC{$dec}->decode($DEC{$dec}->encode($message, Encode::FB_DEFAULT), Encode::FB_DEFAULT); my $not_equal = $s ne $message; my $c = 1; $s =~ s/\01+/$ctrl_res[$c++]/g; if ($not_equal && weechat::config_string_to_boolean(weechat::config_get_plugin('encode_warn'))) { $s .= ' '.weechat::color('chat_prefix_error').'[warning: lossy encode]'; } Nlib::hdh(@line_data, +{ message => $s, tags_array => (join ',', "lmaout_$dec", ($not_equal ? "lmaout_loss" : ()), @tags), }); return weechat::WEECHAT_RC_OK } } # XXX bad hack: \01* might be sprinkled from colorize_nicks, but will mess up later on color restore $message =~ s/\020\01*([[:xdigit:]]+)\01*\020/chr hex $1/ge || return weechat::WEECHAT_RC_OK; my @ctrl_res; if ($message_nc =~ /\01/) { my $id_control = quotemeta $message_nc; $id_control =~ s/(\\\01)+/(.+?)/g; @ctrl_res = $message_c =~ /^()$id_control()$/; } if (esc_only($message)) { my $c = 1; $message =~ s/\01+/$ctrl_res[$c++]/g; Nlib::hdh(@line_data, +{ message => $message }); $ESC_MSG{$lp} = undef; return weechat::WEECHAT_RC_OK } $BYTE_MSGS{$lp} = $message; $MSG_COLOR{$lp} = join "\0", @ctrl_res; $MSG_TIME{$lp} = 0+Nlib::hdh(@line_data, 'date'); $MSG_BUF{$lp} = Nlib::hdh(@line_data, 'buffer', 'name'); my ($nick_tag) = grep s/^nick_//, @tags; $MSG_NICK{$lp} = $nick_tag//''; $MSG_ENC{$lp} = 'x'; $MSG_FLT{$lp} = \undef; apply_recode($lp); if (defined $GC_LIMIT && ++$GC_COUNT > $GC_LIMIT) { gc_lines('int'); $GC_COUNT = 0; } weechat::WEECHAT_RC_OK } ## hook_encode_commands -- hook irc commands needed to add encode prefix ## - tag name sub hook_encode_commands { if ($_[0] eq 'privmsg') { (weechat::hook_command_run('/me', 'auto_encode_cmd', 'me'), weechat::hook_command_run('/msg', 'auto_encode_cmd', 'msg'), weechat::hook_command_run('/query', 'auto_encode_cmd', 'query'), ) } elsif ($_[0] eq 'notice') { (weechat::hook_command_run('/notice', 'auto_encode_cmd', 'query'), weechat::hook_command_run('/wallchops', 'auto_encode_cmd', 'wallchops'), ) } elsif ($_[0] eq 'topic') { (weechat::hook_command_run('/topic', 'auto_encode_cmd', 'topic'), ) } elsif ($_[0] eq 'part') { (weechat::hook_command_run('/part', 'auto_encode_cmd', 'wallchops'), weechat::hook_command_run('/cycle', 'auto_encode_cmd', 'wallchops'), ) } else { () } } # /lma set