CSC212 Splitting text into words while stripping punctuation marks
--D. Thiebaut (talk) 07:49, 2 November 2014 (EST)
This is a quick introduction illustrating how to use a regular expression to split text into words in Java.
Source Code
/**
 * Small demonstration of {@code String.replaceAll} and {@code String.split}
 * with regular expressions, building up to splitting a text into lowercase
 * words with the punctuation removed.
 */
public class RegExpTester {

    /**
     * Runs every demonstration in turn and prints each result to standard output.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        String text = "The\tquick red fox jumped over\nthe lazy brown sleeping dog.";
        System.out.println( "===> text = " + text );

        // replace a single letter: every lowercase i becomes I
        String newText = text.replaceAll( "i", "I" );
        System.out.println( "\n\nreplace i with I: " + newText );

        // replace a literal string (case-sensitive: the leading "The" is not matched)
        newText = text.replaceAll( "the", "a" );
        System.out.println( "\n\nreplace \"the\" with \"a\":\n" + newText );

        // replace using a character class: matches both "The" and "the"
        newText = text.replaceAll( "[Tt]he", "a" );
        System.out.println( "\n\nreplace \"[Tt]he\" with \"a\":\n" + newText );

        // replace everything that is not a vowel (tabs, newlines and spaces included) by #
        newText = text.replaceAll( "[^aeiouy]", "#" );
        System.out.println( "\n\nreplace non-vowels with #:\n" + newText );

        // same, but keep plain spaces as well as vowels
        newText = text.replaceAll( "[^aeiouy ]", "#" );
        System.out.println( "\n\nreplace non-vowels and spaces with #:\n" + newText );

        // replace each white-space character (tab, newline, ...) with a simple space
        newText = text.replaceAll( "\\s", " " );
        System.out.println( "\n\nreplace white space with simple space:\n" + newText );

        // replace white space with simple space, then non-vowels with #
        newText = text.replaceAll( "\\s", " " ).replaceAll( "[^aeiouy ]", "#" );
        System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

        // update text: this one contains a run of several white-space characters
        text = "The\tquick red fox jumped over\n\n\tthe lazy brown sleeping dog.";
        System.out.println( "\n\n----------------------------------------------------" );
        System.out.println( "===> text = " + text );
        System.out.println( "----------------------------------------------------\n\n" );

        // "\\s" replaces one character at a time, so the run "\n\n\t" becomes three spaces
        newText = text.replaceAll( "\\s", " " ).replaceAll( "[^aeiouy ]", "#" );
        System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

        // "\\s+" replaces a whole run of white space by a single space
        newText = text.replaceAll( "\\s+", " " ).replaceAll( "[^aeiouy ]", "#" );
        System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

        // update text: a sentence full of punctuation
        text = "Man! It's a bird! No; wait; Ah, it's a plane... No!?! It's... it's... Superman!";
        System.out.println( "\n\n----------------------------------------------------" );
        System.out.println( "===> text = " + text );
        System.out.println( "----------------------------------------------------\n\n" );

        // Lowercase with Locale.ROOT: with the default locale, a Turkish or Azeri
        // JVM would turn 'I' into the dotless 'ı', which the [^a-zA-Z\s] filter
        // below would then delete ("It's" would become "ts").
        String lowered = text.toLowerCase( java.util.Locale.ROOT );

        // remove non-alphabetic characters from text (white space is kept)
        newText = lowered.replaceAll( "[^a-zA-Z\\s]", "" );
        System.out.println( "\n\nreplace non alphanumeric characters#:\n" + newText );

        // remove non-alphabetic chars, then collapse white-space runs; stripping first
        // means stand-alone punctuation cannot leave a double space behind
        newText = lowered.replaceAll( "[^a-zA-Z\\s]", "" ).replaceAll( "\\s+", " " );
        System.out.println( "\n\nreplace non alphanumeric characters#:\n" + newText );

        // split the cleaned string into words; trim first so that leading white
        // space does not produce an empty first word
        newText = lowered.replaceAll( "[^a-zA-Z\\s]", "" );
        String[] words = newText.trim().split( "\\s+" );
        for ( int i = 0; i < words.length; i++ ) {
            System.out.println( "words[" + i + "] = " + words[i] );
        }
    }
}
Output
/**
 * Small demonstration of {@code String.replaceAll} and {@code String.split}
 * with regular expressions, building up to splitting a text into lowercase
 * words with the punctuation removed.
 */
public class RegExpTester {

    /**
     * Runs every demonstration in turn and prints each result to standard output.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        String text = "The\tquick red fox jumped over\nthe lazy brown sleeping dog.";
        System.out.println( "===> text = " + text );

        // replace a single letter: every lowercase i becomes I
        String newText = text.replaceAll( "i", "I" );
        System.out.println( "\n\nreplace i with I: " + newText );

        // replace a literal string (case-sensitive: the leading "The" is not matched)
        newText = text.replaceAll( "the", "a" );
        System.out.println( "\n\nreplace \"the\" with \"a\":\n" + newText );

        // replace using a character class: matches both "The" and "the"
        newText = text.replaceAll( "[Tt]he", "a" );
        System.out.println( "\n\nreplace \"[Tt]he\" with \"a\":\n" + newText );

        // replace everything that is not a vowel (tabs, newlines and spaces included) by #
        newText = text.replaceAll( "[^aeiouy]", "#" );
        System.out.println( "\n\nreplace non-vowels with #:\n" + newText );

        // same, but keep plain spaces as well as vowels
        newText = text.replaceAll( "[^aeiouy ]", "#" );
        System.out.println( "\n\nreplace non-vowels and spaces with #:\n" + newText );

        // replace each white-space character (tab, newline, ...) with a simple space
        newText = text.replaceAll( "\\s", " " );
        System.out.println( "\n\nreplace white space with simple space:\n" + newText );

        // replace white space with simple space, then non-vowels with #
        newText = text.replaceAll( "\\s", " " ).replaceAll( "[^aeiouy ]", "#" );
        System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

        // update text: this one contains a run of several white-space characters
        text = "The\tquick red fox jumped over\n\n\tthe lazy brown sleeping dog.";
        System.out.println( "\n\n----------------------------------------------------" );
        System.out.println( "===> text = " + text );
        System.out.println( "----------------------------------------------------\n\n" );

        // "\\s" replaces one character at a time, so the run "\n\n\t" becomes three spaces
        newText = text.replaceAll( "\\s", " " ).replaceAll( "[^aeiouy ]", "#" );
        System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

        // "\\s+" replaces a whole run of white space by a single space
        newText = text.replaceAll( "\\s+", " " ).replaceAll( "[^aeiouy ]", "#" );
        System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

        // update text: a sentence full of punctuation
        text = "Man! It's a bird! No; wait; Ah, it's a plane... No!?! It's... it's... Superman!";
        System.out.println( "\n\n----------------------------------------------------" );
        System.out.println( "===> text = " + text );
        System.out.println( "----------------------------------------------------\n\n" );

        // Lowercase with Locale.ROOT: with the default locale, a Turkish or Azeri
        // JVM would turn 'I' into the dotless 'ı', which the [^a-zA-Z\s] filter
        // below would then delete ("It's" would become "ts").
        String lowered = text.toLowerCase( java.util.Locale.ROOT );

        // remove non-alphabetic characters from text (white space is kept)
        newText = lowered.replaceAll( "[^a-zA-Z\\s]", "" );
        System.out.println( "\n\nreplace non alphanumeric characters:\n" + newText );

        // remove non-alphabetic chars, then collapse white-space runs; stripping first
        // means stand-alone punctuation cannot leave a double space behind
        newText = lowered.replaceAll( "[^a-zA-Z\\s]", "" ).replaceAll( "\\s+", " " );
        System.out.println( "\n\nreplace non alphanumeric characters and remove extra white spaces:\n" + newText );

        // split the cleaned string into words; trim first so that leading white
        // space does not produce an empty first word
        newText = lowered.replaceAll( "[^a-zA-Z\\s]", "" );
        System.out.println( "\nOutput list of single words in text:" );
        String[] words = newText.trim().split( "\\s+" );
        for ( int i = 0; i < words.length; i++ ) {
            System.out.println( "words[" + i + "] = " + words[i] );
        }
    }
}