CSC212 Splitting text into words while stripping punctuation marks

From dftwiki3
Jump to: navigation, search

--D. Thiebaut (talk) 07:49, 2 November 2014 (EST)




This is a quick introduction illustrating how to use a regular expression to split text into words in Java.


Source Code


/**
 * Demonstrates {@link String#replaceAll(String, String)} and
 * {@link String#split(String)} with regular expressions, building up from
 * single-character replacement to splitting a sentence into lowercase words
 * with the punctuation removed.
 *
 * <p>Everything is written to standard output; the program takes no input.
 */
public class RegExpTester {

    /**
     * Runs every demonstration in sequence on three hard-coded sample strings.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        final String rule = "----------------------------------------------------";

        String text = "The\tquick red fox jumped over\nthe lazy brown sleeping dog.";
        System.out.println( "===> text = " + text );

        // replace a single letter
        String newText = text.replaceAll( "i",  "I" );
        System.out.println( "\n\nreplace i with I: " + newText );

        // replace a literal string (case-sensitive: the leading "The" is left alone)
        newText = text.replaceAll( "the",  "a" );
        System.out.println( "\n\nreplace \"the\" with \"a\":\n" + newText );

        // a character class matches either capitalization: "The" and "the"
        newText = text.replaceAll( "[Tt]he", "a" );
        System.out.println( "\n\nreplace \"[Tt]he\" with \"a\":\n" + newText );

        // replace every character that is not a lowercase vowel by #
        // (this includes the tab, the newline and the spaces)
        newText = text.replaceAll( "[^aeiouy]", "#" );
        System.out.println( "\n\nreplace non-vowels with #:\n" + newText );

        // same, but the space is added to the excluded set, so spaces survive;
        // the tab and the newline are still replaced by #
        newText = text.replaceAll( "[^aeiouy ]", "#" );
        System.out.println( "\n\nreplace non-vowels and spaces with #:\n" + newText );

        // replace each white-space character (tab, newline, ...) with a simple space
        newText = text.replaceAll( "\\s", " " );
        System.out.println( "\n\nreplace white space with simple space:\n" + newText );

        // normalize white space to simple spaces first, then replace non-vowels with #
        newText = text.replaceAll( "\\s", " " ).replaceAll( "[^aeiouy ]", "#" );
        System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

        // update text: same sentence, now with runs of several white-space characters
        text = "The\tquick     red   fox jumped over\n\n\tthe     lazy  brown sleeping dog.";
        System.out.println( "\n\n" + rule );
        System.out.println( "===> text = " + text );
        System.out.println( rule + "\n\n" );

        // "\\s" replaces white-space characters one by one, so runs of white
        // space stay as runs of spaces
        newText = text.replaceAll( "\\s", " " ).replaceAll( "[^aeiouy ]", "#" );
        System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

        // "\\s+" matches a whole run of white space, collapsing it to a single space
        newText = text.replaceAll( "\\s+", " " ).replaceAll( "[^aeiouy ]", "#" );
        System.out.println( "\n\nreplace white space with simple space, and non-vowels with #:\n" + newText );

        // update text: a sentence with plenty of punctuation
        text = "Man!  It's a bird!    No; wait; Ah, it's a plane...  No!?!   It's... it's...   Superman!";
        System.out.println( "\n\n" + rule );
        System.out.println( "===> text = " + text );
        System.out.println( rule + "\n\n" );

        // Lowercase with Locale.ROOT rather than the JVM default locale: under a
        // Turkish or Azeri default, 'I' lowercases to dotless 'ı', which the
        // [^a-zA-Z\s] filter below would then delete ("It's" -> "ts").
        String lowered = text.toLowerCase( java.util.Locale.ROOT );

        // remove everything that is neither an ASCII letter nor white space
        // (digits are removed too, despite the "alphanumeric" wording of the label)
        newText = lowered.replaceAll( "[^a-zA-Z\\s]", "" );
        System.out.println( "\n\nreplace non alphanumeric characters:\n" + newText );

        // remove non-alphabetic chars, then collapse runs of white space.
        // Stripping first matters: a punctuation-only token between two spaces
        // would otherwise leave a double space behind after the collapse.
        newText = lowered.replaceAll( "[^a-zA-Z\\s]", "" ).replaceAll( "\\s+", " " );
        System.out.println( "\n\nreplace non alphanumeric characters and remove extra white spaces:\n" + newText );

        // split the stripped string into words; "\\s+" treats each run of
        // white space as a single separator
        newText = lowered.replaceAll( "[^a-zA-Z\\s]", "" );
        System.out.println( "\nOutput list of single words in text:" );
        String[] words = newText.split( "\\s+" );
        for ( int i = 0; i < words.length; i++ ) {
            System.out.println( "words[" + i + "] = " + words[i] );
        }
    }

}


Output


===> text = The	quick red fox jumped over
the lazy brown sleeping dog.


replace i with I: The	quIck red fox jumped over
the lazy brown sleepIng dog.


replace "the" with "a":
The	quick red fox jumped over
a lazy brown sleeping dog.


replace "[Tt]he" with "a":
a	quick red fox jumped over
a lazy brown sleeping dog.


replace non-vowels with #:
##e##ui####e###o###u##e##o#e####e##a#y###o#####ee#i####o##


replace non-vowels and spaces with #:
##e##ui## #e# #o# #u##e# o#e####e #a#y ##o## ##ee#i## #o##


replace white space with simple space:
The quick red fox jumped over the lazy brown sleeping dog.


replace white space with simple space, and non-vowels with #:
##e #ui## #e# #o# #u##e# o#e# ##e #a#y ##o## ##ee#i## #o##


----------------------------------------------------
===> text = The	quick     red   fox jumped over

	the     lazy  brown sleeping dog.
----------------------------------------------------




replace white space with simple space, and non-vowels with #:
##e #ui##     #e#   #o# #u##e# o#e#   ##e     #a#y  ##o## ##ee#i## #o##


replace white space with simple space, and non-vowels with #:
##e #ui## #e# #o# #u##e# o#e# ##e #a#y ##o## ##ee#i## #o##


----------------------------------------------------
===> text = Man!  It's a bird!    No; wait; Ah, it's a plane...  No!?!   It's... it's...   Superman!
----------------------------------------------------




replace non alphanumeric characters:
man  its a bird    no wait ah its a plane  no   its its   superman


replace non alphanumeric characters and remove extra white spaces:
man its a bird no wait ah its a plane no its its superman

Output list of single words in text:
words[0] = man
words[1] = its
words[2] = a
words[3] = bird
words[4] = no
words[5] = wait
words[6] = ah
words[7] = its
words[8] = a
words[9] = plane
words[10] = no
words[11] = its
words[12] = its
words[13] = superman