TCMS.pm 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. package TCMS;
  2. use strict;
  3. use warnings;
  4. no warnings 'experimental';
  5. use feature qw{signatures state};
  6. use Clone qw{clone};
  7. use Date::Format qw{strftime};
  8. use Sys::Hostname();
  9. use HTTP::Body ();
  10. use URL::Encode ();
  11. use Text::Xslate ();
  12. use DateTime::Format::HTTP();
  13. use CGI::Cookie ();
  14. use File::Basename();
  15. use IO::Compress::Gzip();
  16. use Time::HiRes qw{gettimeofday tv_interval};
  17. use HTTP::Parser::XS qw{HEADERS_AS_HASHREF};
  18. use List::Util;
  19. use URI();
  20. use Ref::Util qw{is_coderef is_hashref is_arrayref};
  21. #Grab our custom routes
  22. use FindBin::libs;
  23. use Trog::Routes::HTML;
  24. use Trog::Routes::JSON;
  25. use Trog::Log qw{:all};
  26. use Trog::Log::DBI;
  27. use Trog::Auth;
  28. use Trog::Utils;
  29. use Trog::Config;
  30. use Trog::Data;
  31. use Trog::Vars;
  32. use Trog::FileHandler;
  33. # Troglodyne philosophy - simple as possible
  34. # Wrap app to return *our* error handler instead of Plack::Util::run_app's
  35. my $cur_query = {};
  36. sub app {
  37. return eval { _app(@_) } || do {
  38. my $env = shift;
  39. $env->{'psgi.errors'}->print($@);
  40. # Redact the stack trace past line 1, it usually has things which should not be shown
  41. $cur_query->{message} = $@;
  42. $cur_query->{message} =~ s/\n.*//g if $cur_query->{message};
  43. return _error($cur_query);
  44. };
  45. }
  46. =head2 app()
  47. Dispatches requests based on %routes built above.
  48. The dispatcher here does *not* do anything with the authn/authz data. It sets those in the 'user' and 'acls' parameters of the query object passed to routes.
  49. If a path passed is not a defined route (or regex route), but exists as a file under www/, it will be served up immediately.
  50. =cut
  51. sub _app {
  52. # Make sure all writes are with the proper permissions, none need know of our love
  53. umask 0077;
  54. INFO("TCMS starting up on PID $MASTER_PID, Worker PID $$");
  55. # Start the server timing clock
  56. my $start = [gettimeofday];
  57. # Build the routing table
  58. state( $conf, $data, %aliases );
  59. $conf //= Trog::Config::get();
  60. $data //= Trog::Data->new($conf);
  61. my %routes = %{ _routes($data) };
  62. %aliases = $data->aliases() unless %aliases;
  63. # XXX this is built progressively across the forks, leading to inconsistent behavior.
  64. # This should eventually be pre-filled from DB.
  65. my %etags;
  66. # Setup logging
  67. log_init();
  68. my $requestid = Trog::Utils::uuid();
  69. Trog::Log::uuid($requestid);
  70. # Actually start processing the request
  71. my $env = shift;
  72. # Discard the path used in the log, it's too long and enough 4xx error code = ban
  73. return _toolong( { method => $env->{REQUEST_METHOD}, fullpath => '...' } ) if length( $env->{REQUEST_URI} ) > 2048;
  74. # Various stuff important for logging requests
  75. state $domain = $conf->param('general.hostname') || $env->{HTTP_X_FORWARDED_HOST} || $env->{HTTP_HOST} || eval { Sys::Hostname::hostname() };
  76. my $path = $env->{PATH_INFO};
  77. my $port = $env->{HTTP_X_FORWARDED_PORT} // $env->{HTTP_PORT};
  78. my $pport = defined $port ? ":$port" : "";
  79. my $scheme = $env->{'psgi.url_scheme'} // 'http';
  80. my $method = $env->{REQUEST_METHOD};
  81. # It's important that we log what the user ACTUALLY requested rather than the rewritten path later on.
  82. my $fullpath = "$scheme://$domain$pport$path";
  83. # sigdie can now "do the right thing"
  84. $cur_query = { route => $path, fullpath => $path, method => $method };
  85. # Set the IP of the request so we can fail2ban
  86. $Trog::Log::ip = $env->{HTTP_X_FORWARDED_FOR} || $env->{REMOTE_ADDR};
  87. # Set the referer & ua to go into DB logs, but not logs in general.
  88. # The referer/ua largely has no importance beyond being a proto bug report for log messages.
  89. $Trog::Log::DBI::referer = $env->{HTTP_REFERER};
  90. $Trog::Log::DBI::ua = $env->{HTTP_UA};
  91. # Check eTags. If we don't know about it, just assume it's good and lazily fill the cache
  92. # XXX yes, this allows cache poisoning...but only for logged in users!
  93. if ( $env->{HTTP_IF_NONE_MATCH} ) {
  94. INFO("$env->{REQUEST_METHOD} 304 $fullpath");
  95. return [ 304, [], [''] ] if $env->{HTTP_IF_NONE_MATCH} eq ( $etags{ $env->{REQUEST_URI} } || '' );
  96. $etags{ $env->{REQUEST_URI} } = $env->{HTTP_IF_NONE_MATCH} unless exists $etags{ $env->{REQUEST_URI} };
  97. }
  98. # TODO: Actually do something with the language passed to the renderer
  99. my $lang = $env->{HTTP_ACCEPT_LANGUAGE};
  100. #TODO: Actually do something with the acceptable output formats in the renderer
  101. my $accept = $env->{HTTP_ACCEPT};
  102. # Figure out if we want compression or not
  103. my $alist = $env->{HTTP_ACCEPT_ENCODING} || '';
  104. $alist =~ s/\s//g;
  105. my @accept_encodings;
  106. @accept_encodings = split( /,/, $alist );
  107. my $deflate = grep { 'gzip' eq $_ } @accept_encodings;
  108. # NOTE These two parameters are entirely academic, as we don't use ad tracking cookies, but the UTM parameters.
  109. # UTMs are actually fully sufficient to get you what you want -- e.g. keywords, audience groups, a/b testing, etc.
  110. # and you need to put up cookie consent banners if you bother using tracking cookies, which are horrific UX.
  111. #my $no_track = $env->{HTTP_DNT};
  112. #my $no_sell_info = $env->{HTTP_SEC_GPC};
  113. # We generally prefer this to be handled at the reverse proxy level.
  114. #my $prefer_ssl = $env->{HTTP_UPGRADE_INSECURE_REQUESTS};
  115. my $last_fetch = 0;
  116. if ( $env->{HTTP_IF_MODIFIED_SINCE} ) {
  117. $last_fetch = DateTime::Format::HTTP->parse_datetime( $env->{HTTP_IF_MODIFIED_SINCE} )->epoch();
  118. }
  119. #XXX Don't use statics anything that has a search query
  120. # On one hand, I don't want to DOS the disk, but I'd also like some like ?rss...
  121. # Should probably turn those into aliases.
  122. my $has_query = !!$env->{QUERY_STRING};
  123. my $query = {};
  124. $query = URL::Encode::url_params_mixed( $env->{QUERY_STRING} ) if $env->{QUERY_STRING};
  125. #Actually parse the POSTDATA and dump it into the QUERY object if this is a POST
  126. if ( $env->{REQUEST_METHOD} eq 'POST' ) {
  127. my $body = HTTP::Body->new( $env->{CONTENT_TYPE}, $env->{CONTENT_LENGTH} );
  128. while ( $env->{'psgi.input'}->read( my $buf, $Trog::Vars::CHUNK_SIZE ) ) {
  129. $body->add($buf);
  130. }
  131. @$query{ keys( %{ $body->param } ) } = values( %{ $body->param } );
  132. @$query{ keys( %{ $body->upload } ) } = values( %{ $body->upload } );
  133. }
  134. # It's mod_rewrite!
  135. $path = '/index' if $path eq '/';
  136. #XXX this is hardcoded in browsers, so just rewrite the path
  137. $path = '/img/icon/favicon.ico' if $path eq '/favicon.ico';
  138. # Translate alias paths into their actual path
  139. $path = $aliases{$path} if exists $aliases{$path};
  140. # Collapse multiple slashes in the path
  141. $path =~ s/[\/]+/\//g;
  142. #Handle regex/capture routes
  143. if ( !exists $routes{$path} ) {
  144. my @captures;
  145. # XXX maybe this should all just go into $query?
  146. # TODO can optimize by having separate hashes for capture/non-capture routes
  147. foreach my $pattern ( keys(%routes) ) {
  148. @captures = $path =~ m/^$pattern$/;
  149. if (@captures) {
  150. $path = $pattern;
  151. foreach my $field ( @{ $routes{$path}{captures} } ) {
  152. $routes{$path}{data} //= {};
  153. $routes{$path}{data}{$field} = shift @captures;
  154. }
  155. last;
  156. }
  157. }
  158. }
  159. # Set the 'data' in the query that the route specifically overrides, which we are also using for the catpured data
  160. # This also means you have to validate both of them via parameters if you set that up.
  161. @{$query}{ keys( %{ $routes{$path}{'data'} } ) } = values( %{ $routes{$path}{'data'} } ) if ref $routes{$path}{'data'} eq 'HASH' && %{ $routes{$path}{'data'} };
  162. # Ensure any short-circuit routes can log the request, and return the server-timing headers properly
  163. $query->{method} = $method;
  164. $query->{route} = $path;
  165. $query->{fullpath} = $fullpath;
  166. $query->{start} = $start;
  167. # Handle HTTP range/streaming requests
  168. my $range = $env->{HTTP_RANGE} || "bytes=0-" if $env->{HTTP_RANGE} || $env->{HTTP_IF_RANGE};
  169. my $streaming = $env->{'psgi.streaming'};
  170. $query->{streaming} = $streaming;
  171. my @ranges;
  172. if ($range) {
  173. $range =~ s/bytes=//g;
  174. push(
  175. @ranges,
  176. map {
  177. [ split( /-/, $_ ) ];
  178. #$tuples[1] //= $tuples[0] + $Trog::Vars::CHUNK_SIZE;
  179. #\@tuples
  180. } split( /,/, $range )
  181. );
  182. }
  183. # If it's a file, just serve it
  184. return Trog::FileHandler::serve( $fullpath, "www/$path", $start, $streaming, \@ranges, $last_fetch, $deflate ) if -f "www/$path";
  185. # Figure out if we have a logged in user, so we can serve them user-specific files
  186. my $cookies = {};
  187. if ( $env->{HTTP_COOKIE} ) {
  188. $cookies = CGI::Cookie->parse( $env->{HTTP_COOKIE} );
  189. }
  190. my $active_user = '';
  191. $Trog::Log::user = 'nobody';
  192. if ( exists $cookies->{tcmslogin} ) {
  193. $active_user = Trog::Auth::session2user( $cookies->{tcmslogin}->value );
  194. $Trog::Log::user = $active_user if $active_user;
  195. }
  196. return Trog::FileHandler::serve( $fullpath, "totp/$path", $start, $streaming, \@ranges, $last_fetch, $deflate ) if -f "totp/$path" && $active_user;
  197. # Now that we have firmed up the actual routing, let's validate.
  198. return _forbidden($query) if exists $routes{$path}{auth} && !$active_user;
  199. return _notfound($query) unless exists $routes{$path} && ref $routes{$path} eq 'HASH' && keys( %{ $routes{$path} } );
  200. return _badrequest($query) unless grep { $env->{REQUEST_METHOD} eq $_ } ( $routes{$path}{method} || '', 'HEAD' );
  201. # Disallow any paths that are naughty ( starman auto-removes .. up-traversal)
  202. if ( index( $path, '/templates' ) == 0 || index( $path, '/statics' ) == 0 || $path =~ m/.*(\.psgi|\.pm)$/i ) {
  203. return _forbidden($query);
  204. }
  205. # Set the urchin parameters if necessary.
  206. %$Trog::Log::DBI::urchin = map { $_ => delete $query->{$_} } qw{utm_source utm_medium utm_campaign utm_term utm_content};
  207. # Now that we've parsed the query and know where we want to go, we should murder everything the route does not explicitly want, and validate what it does
  208. my $parameters = $routes{$path}{parameters};
  209. if ($parameters) {
  210. die "invalid route definition for $path: bad parameters" unless is_hashref($parameters);
  211. my @known_params = keys(%$parameters);
  212. for my $param (@known_params) {
  213. die "Invalid route definition for $path: parameter $param must correspond to a validation CODEREF." unless is_coderef( $parameters->{$param} );
  214. # A missing parameter is not necessarily a problem.
  215. next unless $query->{$param};
  216. # But if we have it, and it's bad, nack it, so that scanners get fail2banned.
  217. DEBUG("Rejected $fullpath for bad query param $param");
  218. return _badrequest($query) unless $parameters->{$param}->( $query->{$param} );
  219. }
  220. # Smack down passing of unnecessary fields
  221. foreach my $field ( keys(%$query) ) {
  222. next if List::Util::any { $field eq $_ } @known_params;
  223. next if List::Util::any { $field eq $_ } qw{start route streaming method fullpath};
  224. DEBUG("Rejected $fullpath for query param $field");
  225. return _badrequest($query);
  226. }
  227. }
  228. # Let's open up our default route before we bother thinking about routing any harder
  229. return $routes{default}{callback}->($query) unless -f "config/setup";
  230. $query->{user_acls} = [];
  231. $query->{user_acls} = Trog::Auth::acls4user($active_user) // [] if $active_user;
  232. # Grab the list of ACLs we want to add to a post, if any.
  233. $query->{acls} = [ $query->{acls} ] if ( $query->{acls} && ref $query->{acls} ne 'ARRAY' );
  234. # Filter out passed ACLs which are naughty
  235. my $is_admin = grep { $_ eq 'admin' } @{ $query->{user_acls} };
  236. @{ $query->{acls} } = grep { $_ ne 'admin' } @{ $query->{acls} } unless $is_admin;
  237. # If we have a static render, just use it instead (These will ALWAYS be correct, data saves invalidate this)
  238. # TODO: make this key on admin INSTEAD of active user when we add non-admin users.
  239. if ( !$active_user && !$has_query ) {
  240. return _static( $fullpath, "$path.z", $start, $streaming ) if -f "www/statics/$path.z" && $deflate;
  241. return _static( $fullpath, $path, $start, $streaming ) if -f "www/statics/$path";
  242. }
  243. $query->{deflate} = $deflate;
  244. $query->{user} = $active_user;
  245. #Set various things we don't want overridden
  246. $query->{body} = '';
  247. $query->{dnt} = $env->{HTTP_DNT};
  248. $query->{user} = $active_user;
  249. $query->{domain} = $domain;
  250. $query->{route} = $path;
  251. $query->{scheme} = $scheme;
  252. $query->{social_meta} = 1;
  253. $query->{primary_post} = {};
  254. $query->{has_query} = $has_query;
  255. $query->{port} = $port;
  256. $query->{lang} = $lang;
  257. $query->{accept} = $accept;
  258. # Redirecting somewhere naughty not allow
  259. $query->{to} = URI->new( $query->{to} // '' )->path() || $query->{to} if $query->{to};
  260. DEBUG("DISPATCH $path to $routes{$path}{callback}");
  261. #XXX there is a trick to now use strict refs, but I don't remember it right at the moment
  262. {
  263. no strict 'refs';
  264. my $output = $routes{$path}{callback}->($query);
  265. die "$path returned no data!" unless ref $output eq 'ARRAY' && @$output == 3;
  266. my $pport = defined $query->{port} ? ":$query->{port}" : "";
  267. INFO("$env->{REQUEST_METHOD} $output->[0] $fullpath");
  268. # Append server-timing headers if they aren't present
  269. my $tot = tv_interval($start) * 1000;
  270. push( @{ $output->[1] }, 'Server-Timing' => "app;dur=$tot" ) unless List::Util::any { $_ eq 'Server-Timing' } @{ $output->[1] };
  271. return $output;
  272. }
  273. }
  274. #XXX Return a clone of the routing table ref, because code modifies it later
  275. sub _routes ( $data = {} ) {
  276. state %routes;
  277. return clone( \%routes ) if %routes;
  278. if ( !$data ) {
  279. my $conf = Trog::Config::get();
  280. $data = Trog::Data->new($conf);
  281. }
  282. my %roots = $data->routes();
  283. %routes = %Trog::Routes::HTML::routes;
  284. @routes{ keys(%Trog::Routes::JSON::routes) } = values(%Trog::Routes::JSON::routes);
  285. @routes{ keys(%roots) } = values(%roots);
  286. # Add in global routes, here because they *must* know about all other routes
  287. # Also, nobody should ever override these.
  288. $routes{'/robots.txt'} = {
  289. method => 'GET',
  290. callback => \&robots,
  291. };
  292. return clone( \%routes );
  293. }
  294. =head2 robots
  295. Return an appropriate robots.txt
  296. This is a "special" route as it needs to know about all the routes in order to disallow noindex=1 routes.
  297. =cut
  298. sub robots ($query) {
  299. state $etag = "robots-" . time();
  300. my $routes = _routes();
  301. # If there's a 'capture' route, we need to format it correctly.
  302. state @banned = map { exists $routes->{$_}{robot_name} ? $routes->{$_}{robot_name} : $_ } grep { $routes->{$_}{noindex} } sort keys(%$routes);
  303. return Trog::Renderer->render(
  304. contenttype => 'text/plain',
  305. template => 'robots.tx',
  306. data => {
  307. etag => $etag,
  308. banned => \@banned,
  309. %$query,
  310. },
  311. code => 200,
  312. );
  313. }
  314. sub _generic ( $type, $query ) {
  315. return _static( "$type.z", $query->{start}, $query->{streaming} ) if -f "www/statics/$type.z";
  316. return _static( $type, $query->{start}, $query->{streaming} ) if -f "www/statics/$type";
  317. my %lookup = (
  318. notfound => \&Trog::Routes::HTML::notfound,
  319. forbidden => \&Trog::Routes::HTML::forbidden,
  320. badrequest => \&Trog::Routes::HTML::badrequest,
  321. toolong => \&Trog::Routes::HTML::toolong,
  322. error => \&Trog::Routes::HTML::error,
  323. );
  324. return $lookup{$type}->($query);
  325. }
  326. sub _notfound ($query) {
  327. INFO("$query->{method} 404 $query->{fullpath}");
  328. return _generic( 'notfound', $query );
  329. }
  330. sub _forbidden ($query) {
  331. INFO("$query->{method} 403 $query->{fullpath}");
  332. return _generic( 'forbidden', $query );
  333. }
  334. sub _badrequest ($query) {
  335. INFO("$query->{method} 400 $query->{fullpath}");
  336. return _generic( 'badrequest', $query );
  337. }
  338. sub _toolong ($query) {
  339. INFO("$query->{method} 419 $query->{fullpath}");
  340. return _generic( 'toolong', {} );
  341. }
  342. sub _error ($query) {
  343. $query->{method} //= "UNKNOWN";
  344. $query->{fullpath} //= $query->{route} // '/?';
  345. INFO("$query->{method} 500 $query->{fullpath}");
  346. return _generic( 'error', $query );
  347. }
  348. sub _static ( $fullpath, $path, $start, $streaming, $last_fetch = 0 ) {
  349. DEBUG("Rendering static for $path");
  350. # XXX because of psgi I can't just vomit the file directly
  351. if ( open( my $fh, '<', "www/statics/$path" ) ) {
  352. my $headers = '';
  353. # NOTE: this is relying on while advancing the file pointer
  354. while (<$fh>) {
  355. last if $_ eq "\n";
  356. $headers .= $_;
  357. }
  358. my ( undef, undef, $status, undef, $headers_parsed ) = HTTP::Parser::XS::parse_http_response( "$headers\n", HEADERS_AS_HASHREF );
  359. #XXX need to put this into the file itself
  360. my $mt = ( stat($fh) )[9];
  361. my @gm = gmtime($mt);
  362. my $now_string = strftime( "%a, %d %b %Y %H:%M:%S GMT", @gm );
  363. my $code = $mt > $last_fetch ? $status : 304;
  364. $headers_parsed->{"Last-Modified"} = $now_string;
  365. # Append server-timing headers
  366. my $tot = tv_interval($start) * 1000;
  367. $headers_parsed->{'Server-Timing'} = "static;dur=$tot";
  368. #XXX uwsgi just opens the file *again* when we already have a filehandle if it has a path.
  369. # starman by comparison doesn't violate the principle of least astonishment here.
  370. # This is probably a performance optimization, but makes the kind of micromanagement I need to do inconvenient.
  371. # As such, we will just return a stream.
  372. INFO("GET 200 $fullpath");
  373. return sub {
  374. my $responder = shift;
  375. #push(@headers, 'Content-Length' => $sz);
  376. my $writer = $responder->( [ $code, [%$headers_parsed] ] );
  377. while ( $fh->read( my $buf, $Trog::Vars::CHUNK_SIZE ) ) {
  378. $writer->write($buf);
  379. }
  380. close $fh;
  381. $writer->close;
  382. }
  383. if $streaming;
  384. return [ $code, [%$headers_parsed], $fh ];
  385. }
  386. INFO("GET 403 $fullpath");
  387. return [ 403, [ 'Content-Type' => $Trog::Vars::content_types{text} ], ["STAY OUT YOU RED MENACE"] ];
  388. }
  389. 1;