Typography.php

Go to the documentation of this file.
00001 <?php  if ( ! defined('BASEPATH')) exit('No direct script access allowed');
00002 /**
00003  * CodeIgniter
00004  *
00005  * An open source application development framework for PHP 4.3.2 or newer
00006  *
00007  * @package             CodeIgniter
00008  * @author              ExpressionEngine Dev Team
00009  * @copyright   Copyright (c) 2008, EllisLab, Inc.
00010  * @license             http://codeigniter.com/user_guide/license.html
00011  * @link                http://codeigniter.com
00012  * @since               Version 1.0
00013  * @filesource
00014  */
00015 
00016 // ------------------------------------------------------------------------
00017 
00018 /**
00019  * Typography Class
00020  *
00021  *
00022  * @access              private
00023  * @category    Helpers
00024  * @author              ExpressionEngine Dev Team
00025  * @link                http://codeigniter.com/user_guide/helpers/
00026  */
class CI_Typography {

        // Block level elements that should not be wrapped inside <p> tags
        var $block_elements = 'address|blockquote|div|dl|fieldset|form|h\d|hr|noscript|object|ol|p|pre|script|table|ul';

        // Elements that should not have <p> and <br /> tags within them.
        var $skip_elements      = 'p|pre|ol|ul|dl|object|table';

        // Tags we want the parser to completely ignore when splitting the string.
        var $inline_elements = 'a|abbr|acronym|b|bdo|br|button|cite|code|del|dfn|em|i|img|ins|input|label|map|kbd|samp|select|span|strong|sub|sup|textarea|var';

        // Whether or not to protect quotes within { curly braces }
        var $protect_braced_quotes = FALSE;

        /**
         * Constructor
         *
         * Nothing to do here...
         *
         */
        function CI_Typography()
        {
        }

        // --------------------------------------------------------------------

        /**
         * Auto Typography
         *
         * This function converts text, making it typographically correct:
         *      - Converts double newlines into paragraphs.
         *      - Converts single line breaks into <br /> tags
         *      - Converts single and double quotes into correctly facing curly quote entities.
         *      - Converts three dots into ellipsis.
         *      - Converts double dashes into em-dashes.
         *      - Converts two spaces into entities
         *
         * Text inside the elements listed in $skip_elements is passed through
         * without paragraph or line break conversion.  Tag names are matched
         * case-insensitively, so <PRE> is treated exactly like <pre>.
         *
         * @access      public
         * @param       string  the text to format
         * @param       bool    whether to strip javascript event handlers (and xmlns attributes) for security
         * @param       bool    whether to reduce more than two consecutive newlines to two
         * @return      string  the formatted markup, or an empty string for empty input
         */
        function auto_typography($str, $strip_js_event_handlers = TRUE, $reduce_linebreaks = FALSE)
        {
                if ($str == '')
                {
                        return '';
                }

                // Standardize Newlines to make matching easier
                if (strpos($str, "\r") !== FALSE)
                {
                        $str = str_replace(array("\r\n", "\r"), "\n", $str);
                }

                // Reduce line breaks.  If there are more than two consecutive linebreaks
                // we'll compress them down to a maximum of two since there's no benefit to more.
                if ($reduce_linebreaks === TRUE)
                {
                        $str = preg_replace("/\n\n+/", "\n\n", $str);
                }

                // Do we allow JavaScript event handlers? If not, we strip them from within all tags
                if ($strip_js_event_handlers === TRUE)
                {
                        $str = preg_replace("#<([^><]+?)([^a-z_\-]on\w*|xmlns)(\s*=\s*[^><]*)([><]*)#i", "<\\1\\4", $str);
                }

                // Convert quotes within tags to temporary markers. We don't want quotes converted
                // within tags so we'll temporarily convert them to {@DQ} and {@SQ}
                $str = $this->_protect_quotes($str, "#<.+?>#si");

                // Optionally give { curly brace } expressions the same protection
                if ($this->protect_braced_quotes === TRUE)
                {
                        $str = $this->_protect_quotes($str, "#\{.+?}#si");
                }

                // Convert "ignore" tags to temporary marker.  The parser splits out the string at every tag
                // it encounters.  Certain inline tags, like image tags, links, span tags, etc. will be
                // adversely affected if they are split out so we'll convert the opening bracket < temporarily to: {@TAG}
                $str = preg_replace("#<(/*)(".$this->inline_elements.")([ >])#i", "{@TAG}\\1\\2\\3", $str);

                // Split the string at every tag.  This expression creates an array with this prototype:
                //
                //      [array]
                //      {
                //              [0] = <opening tag>
                //              [1] = Content...
                //              [2] = <closing tag>
                //              Etc...
                //      }
                $chunks = preg_split('/(<(?:[^<>]+(?:"[^"]*"|\'[^\']*\')?)+>)/', $str, -1, PREG_SPLIT_DELIM_CAPTURE|PREG_SPLIT_NO_EMPTY);

                // Build our finalized string.  We cycle through the array, skipping tags, and processing the contained text
                $str = '';
                $process = TRUE;
                foreach ($chunks as $chunk)
                {
                        // Are we dealing with a block level tag? If so, we'll skip the processing for this cycle.
                        // We'll also set the "process" flag which allows us to skip <pre> tags and a few other things.
                        // The match is case-insensitive so that it agrees with the inline-tag and
                        // leading-block-element checks, which already ignore case.
                        if (preg_match("#<(/*)(".$this->block_elements.").*?>#i", $chunk, $match))
                        {
                                if (preg_match("#".$this->skip_elements."#i", $match[2]))
                                {
                                        // An opening skip tag turns processing off, its closing tag turns it back on
                                        $process = ($match[1] == '/') ? TRUE : FALSE;
                                }

                                $str .= $chunk;
                                continue;
                        }

                        if ($process == FALSE)
                        {
                                $str .= $chunk;
                                continue;
                        }

                        // Convert Newlines into <p> and <br /> tags
                        $str .= $this->_format_newlines($chunk);
                }

                // Is the whole of the content inside a block level element?  If not, wrap it in a paragraph
                if ( ! preg_match("/^<(?:".$this->block_elements.")/i", $str, $match))
                {
                        $str = "<p>{$str}</p>";
                }

                // Convert quotes, ellipsis, and em-dashes
                $str = $this->format_characters($str);

                // Final clean up
                $table = array(

                                                // If the user submitted their own paragraph tags within the text
                                                // we will retain them instead of using our tags.
                                                '/(<p.*?>)<p>/'         => '$1',

                                                // Reduce multiple instances of opening/closing paragraph tags to a single one
                                                '#(</p>)+#'                     => '</p>',
                                                '/(<p><p>)+/'           => '<p>',

                                                // Clean up stray paragraph tags that appear before block level elements
                                                '#<p></p><('.$this->block_elements.')#i'        => '<$1',

                                                // Replace the temporary markers we added earlier
                                                '/\{@TAG\}/'            => '<',
                                                '/\{@DQ\}/'                     => '"',
                                                '/\{@SQ\}/'                     => "'"

                                                );

                // Do we need to reduce empty lines?
                if ($reduce_linebreaks === TRUE)
                {
                        $table['#<p>\n*</p>#'] = '';
                }
                else
                {
                        // If we have empty paragraph tags we add a non-breaking space
                        // otherwise most browsers won't treat them as true paragraphs
                        $table['#<p></p>#'] = '<p>&nbsp;</p>';
                }

                return preg_replace(array_keys($table), $table, $str);
        }

        // --------------------------------------------------------------------

        /**
         * Protect Quotes
         *
         * Finds every substring matching the supplied pattern and swaps the
         * single and double quotes inside it for the temporary {@SQ} and {@DQ}
         * markers, so that format_characters() leaves them alone.  The markers
         * are turned back into real quotes at the end of auto_typography().
         *
         * @access      private
         * @param       string  the text to scan
         * @param       string  PCRE pattern describing the regions to protect
         * @return      string
         */
        function _protect_quotes($str, $pattern)
        {
                if (preg_match_all($pattern, $str, $matches))
                {
                        $total = count($matches[0]);

                        for ($i = 0; $i < $total; $i++)
                        {
                                $str = str_replace($matches[0][$i],
                                                                        str_replace(array("'", '"'), array('{@SQ}', '{@DQ}'), $matches[0][$i]),
                                                                        $str);
                        }
                }

                return $str;
        }

        // --------------------------------------------------------------------

        /**
         * Format Characters
         *
         * This function mainly converts double and single quotes
         * to curly entities, but it also converts em-dashes, ellipses,
         * double spaces, and ampersands
         *
         * @access      public
         * @param       string
         * @return      string
         */
        function format_characters($str)
        {
                // The replacement table never changes, so it is built once and reused
                static $table;

                if ( ! isset($table))
                {
                        $table = array(
                                                        // nested smart quotes, opening and closing
                                                        // note that rules for grammar (English) allow only for two levels deep
                                                        // and that single quotes are _supposed_ to always be on the outside
                                                        // but we'll accommodate both
                                                        '/(^|\W|\s)\'"/'                                => '$1&#8216;&#8220;',
                                                        '/\'"(\s|\W|$)/'                                => '&#8217;&#8221;$1',
                                                        '/(^|\W|\s)"\'/'                                => '$1&#8220;&#8216;',
                                                        '/"\'(\s|\W|$)/'                                => '&#8221;&#8217;$1',

                                                        // single quote smart quotes
                                                        '/\'(\s|\W|$)/'                                 => '&#8217;$1',
                                                        '/(^|\W|\s)\'/'                                 => '$1&#8216;',

                                                        // double quote smart quotes
                                                        '/"(\s|\W|$)/'                                  => '&#8221;$1',
                                                        '/(^|\W|\s)"/'                                  => '$1&#8220;',

                                                        // apostrophes
                                                        "/(\w)'(\w)/"                   => '$1&#8217;$2',

                                                        // Em dash and ellipses dots
                                                        '/\s?\-\-\s?/'                                  => '&#8212;',
                                                        '/(\w)\.{3}/'                                   => '$1&#8230;',

                                                        // double space after sentences
                                                        '/(\W)  /'                                              => '$1&nbsp; ',

                                                        // ampersands, if not a character entity
                                                        '/&(?!#?[a-zA-Z0-9]{2,};)/'             => '&amp;'
                                        );
                }

                return preg_replace(array_keys($table), $table, $str);
        }

        // --------------------------------------------------------------------

        /**
         * Format Newlines
         *
         * Converts newline characters into either <p> tags or <br />
         *
         * @access      private
         * @param       string
         * @return      string
         */
        function _format_newlines($str)
        {
                if ($str == '')
                {
                        return $str;
                }

                // Nothing to convert when there are no newlines at all
                if (strpos($str, "\n") === FALSE)
                {
                        return $str;
                }

                // Convert two consecutive newlines to paragraphs
                $str = str_replace("\n\n", "</p>\n\n<p>", $str);

                // Convert single newlines to <br /> tags
                $str = preg_replace("/([^\n])(\n)([^\n])/", "\\1<br />\\2\\3", $str);

                // Wrap the whole enchilada in enclosing paragraphs
                if ($str != "\n")
                {
                        $str =  '<p>'.$str.'</p>';
                }

                // Remove empty paragraphs if they are on the first line, as this
                // is a potential unintended consequence of the previous code
                $str = preg_replace("/<p><\/p>(.*)/", "\\1", $str, 1);

                return $str;
        }

        // ------------------------------------------------------------------------

        /**
         * Convert newlines to HTML line breaks except within PRE tags
         *
         * A PRE block is recognised regardless of letter case and whether or
         * not its opening tag carries attributes.  A <pre> that is never closed
         * protects everything up to the end of the string.
         *
         * @access      public
         * @param       string
         * @return      string
         */
        function nl2br_except_pre($str)
        {
                // With one capturing group and PREG_SPLIT_DELIM_CAPTURE the result alternates:
                // even indexes hold ordinary text, odd indexes hold complete <pre> blocks.
                $parts = preg_split('#(<pre\b[^>]*>.*?(?:</pre\s*>|\z))#is', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

                if ($parts === FALSE)
                {
                        // The regular expression engine gave up; converting everything
                        // is preferable to returning nothing at all.
                        return nl2br($str);
                }

                $newstr = '';
                foreach ($parts as $i => $part)
                {
                        $newstr .= (($i % 2) == 0) ? nl2br($part) : $part;
                }

                return $newstr;
        }

}
00339 // END Typography Class
00340 
00341 /* End of file Typography.php */
00342 /* Location: ./system/libraries/Typography.php */